diff --git a/sql/2025/third-parties/consent_signal_basic_analysis.sql b/sql/2025/third-parties/consent_signal_basic_analysis.sql
new file mode 100644
index 00000000000..aac8ad9dd53
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_basic_analysis.sql
@@ -0,0 +1,90 @@
+#standardSQL
+# Basic consent signal analysis (simplified version to ensure data returns)
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01' AND
+    rank <= 50000 -- Expand to top 50K sites
+),
+
+-- Find requests with consent signals (no redirect filtering)
+consent_requests AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url,
+    NET.REG_DOMAIN(r.page) AS page_domain,
+    NET.REG_DOMAIN(r.url) AS url_domain,
+
+    -- Extract consent signals
+    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
+    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard,
+
+    -- Check if request has redirects
+    JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL AND
+    TO_JSON_STRING(JSON_EXTRACT(r.summary, '$.redirects')) != '[]' AS has_redirects
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-07-01' AND
+    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) AND -- Third-party only
+    (
+      REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=')
+    )
+),
+
+-- Add any-consent-signal flag (always TRUE here: consent_requests already keeps only URLs matching one of the four patterns)
+requests_with_signals AS (
+  SELECT
+    *,
+    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_signal
+  FROM
+    consent_requests
+)
+
+-- Basic analysis
+SELECT
+  client,
+
+  -- Overall counts
+  COUNT(0) AS total_requests_with_consent_signals,
+  COUNT(DISTINCT page) AS total_pages_with_consent_signals,
+  COUNT(DISTINCT url_domain) AS total_domains_with_consent_signals,
+
+  -- Signal type breakdown
+  COUNTIF(has_usp_standard) AS usp_standard_requests,
+  COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
+  COUNTIF(has_tcf_standard) AS tcf_standard_requests,
+  COUNTIF(has_gpp_standard) AS gpp_standard_requests,
+
+  -- Percentage breakdown
+  COUNTIF(has_usp_standard) / COUNT(0) AS pct_usp_standard,
+  COUNTIF(has_usp_nonstandard) / COUNT(0) AS pct_usp_nonstandard,
+  COUNTIF(has_tcf_standard) / COUNT(0) AS pct_tcf_standard,
+  COUNTIF(has_gpp_standard) / COUNT(0) AS pct_gpp_standard,
+
+  -- Redirect availability
+  COUNTIF(has_redirects) AS requests_with_redirects,
+  COUNTIF(has_redirects) / COUNT(0) AS pct_requests_with_redirects
+
+FROM
+  requests_with_signals
+GROUP BY
+  client
+ORDER BY
+  client
diff --git a/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql b/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql
new file mode 100644
index 00000000000..ca72152d977
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql
@@ -0,0 +1,195 @@
+#standardSQL
+# Consent signal prevalence broken down by third-party category
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'
+),
+
+requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01'
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+-- Get total requests per category and rank grouping for percentage calculations
+category_totals AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    tp.category,
+    COUNT(0) AS total_category_requests,
+    COUNT(DISTINCT r.page) AS total_category_pages,
+    COUNT(DISTINCT tp.canonicalDomain) AS total_category_domains
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping
+  GROUP BY
+    r.client,
+    rank_grouping,
+    tp.category
+),
+
+-- Extract consent signals from third-party requests
+consent_signals_by_category AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    tp.category,
+    tp.canonicalDomain,
+    r.page,
+    r.url,
+
+    -- Single-pass consent signal detection
+    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
+    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard
+
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping AND
+    -- Pre-filter: only process URLs that might contain consent-related parameters
+    REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+),
+
+-- Add computed flag for any consent signal
+signals_with_any AS (
+  SELECT
+    *,
+    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal
+  FROM
+    consent_signals_by_category
+),
+
+-- Aggregate consent signals by category
+category_signal_aggregates AS (
+  SELECT
+    client,
+    rank_grouping,
+    category,
+
+    -- USP Standard metrics
+    COUNTIF(has_usp_standard) AS usp_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_usp_standard THEN page END) AS usp_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains,
+
+    -- USP Non-Standard metrics
+    COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
+    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages,
+    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains,
+
+    -- TCF Standard metrics
+    COUNTIF(has_tcf_standard) AS tcf_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains,
+
+    -- GPP Standard metrics
+    COUNTIF(has_gpp_standard) AS gpp_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains,
+
+    -- Any consent signal metrics
+    COUNTIF(has_any_consent_signal) AS any_consent_requests,
+    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages,
+    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains,
+
+    -- Totals for this filtered dataset
+    COUNT(0) AS total_filtered_requests
+  FROM
+    signals_with_any
+  GROUP BY
+    client,
+    rank_grouping,
+    category
+)
+
+-- Final output using UNNEST to avoid repetitive UNION ALL
+SELECT
+  agg.client,
+  agg.rank_grouping,
+  agg.category,
+  signal_data.signal_type,
+  signal_data.requests_with_signal,
+  totals.total_category_requests,
+  signal_data.requests_with_signal / totals.total_category_requests AS pct_requests_with_signal,
+  signal_data.pages_with_signal,
+  totals.total_category_pages,
+  signal_data.pages_with_signal / totals.total_category_pages AS pct_pages_with_signal,
+  signal_data.domains_with_signal,
+  totals.total_category_domains,
+  signal_data.domains_with_signal / totals.total_category_domains AS pct_domains_with_signal
+FROM
+  category_signal_aggregates agg
+JOIN
+  category_totals totals
+USING (client, rank_grouping, category)
+CROSS JOIN
+  UNNEST([
+    STRUCT('USP Standard' AS signal_type, usp_standard_requests AS requests_with_signal, usp_standard_pages AS pages_with_signal, usp_standard_domains AS domains_with_signal),
+    STRUCT('USP Non-Standard' AS signal_type, usp_nonstandard_requests AS requests_with_signal, usp_nonstandard_pages AS pages_with_signal, usp_nonstandard_domains AS domains_with_signal),
+    STRUCT('TCF Standard' AS signal_type, tcf_standard_requests AS requests_with_signal, tcf_standard_pages AS pages_with_signal, tcf_standard_domains AS domains_with_signal),
+    STRUCT('GPP Standard' AS signal_type, gpp_standard_requests AS requests_with_signal, gpp_standard_pages AS pages_with_signal, gpp_standard_domains AS domains_with_signal),
+    STRUCT('Any Consent Signal' AS signal_type, any_consent_requests AS requests_with_signal, any_consent_pages AS pages_with_signal, any_consent_domains AS domains_with_signal)
+  ]) AS signal_data
+WHERE
+  signal_data.requests_with_signal > 0 -- Only show categories with consent signals
+
+ORDER BY
+  client,
+  rank_grouping,
+  category,
+  signal_type
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_chains.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_chains.sql
new file mode 100644
index 00000000000..eb7e2da6163
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_chains.sql
@@ -0,0 +1,215 @@
+#standardSQL
+# Optimized: Consent signal survival rate through inclusion chains (memory-efficient)
+
+# Flags which consent parameter families (USP, non-standard USP, TCF, GPP) appear as query parameters in `url`.
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL
+>
+LANGUAGE js AS """
+  try {
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.has_any_signal = signals.has_usp_standard ||
+      signals.has_usp_nonstandard ||
+      signals.has_tcf_standard ||
+      signals.has_gpp_standard;
+
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false,
+      has_usp_nonstandard: false,
+      has_tcf_standard: false,
+      has_gpp_standard: false,
+      has_any_signal: false
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01' AND
+    rank <= 10000 -- Aggressive filtering: top 10K only
+),
+
+-- Pre-filter to only requests with consent signals or initiator info
+filtered_requests AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url,
+    NET.REG_DOMAIN(r.page) AS root_page,
+    NET.REG_DOMAIN(r.url) AS third_party,
+    NET.REG_DOMAIN(JSON_VALUE(r.payload, '$._initiator')) AS initiator_etld,
+    extractConsentSignals(r.url) AS consent_signals
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-07-01' AND
+    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) AND -- Third-party only
+    (
+      -- Only process requests with consent signals OR that are part of chains
+      REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') OR
+      JSON_VALUE(r.payload, '$._initiator') IS NOT NULL
+    )
+),
+
+-- Simplified two-step chain analysis (avoid complex recursion)
+step_1_requests AS (
+  SELECT
+    client,
+    root_page,
+    third_party,
+    consent_signals,
+    COUNT(0) AS step1_count
+  FROM
+    filtered_requests
+  WHERE
+    initiator_etld = root_page AND -- Direct first-party to third-party requests
+    consent_signals.has_any_signal = TRUE
+  GROUP BY
+    client,
+    root_page,
+    third_party,
+    consent_signals
+),
+
+step_2_requests AS (
+  SELECT
+    fr.client,
+    s1.root_page,
+    s1.third_party AS step1_party,
+    fr.third_party AS step2_party,
+    s1.consent_signals AS step1_signals,
+    fr.consent_signals AS step2_signals,
+    COUNT(0) AS step2_count
+  FROM
+    filtered_requests fr
+  INNER JOIN
+    step_1_requests s1
+  ON
+    fr.client = s1.client AND
+    fr.root_page = s1.root_page AND
+    fr.initiator_etld = s1.third_party -- Third-party chain
+  GROUP BY
+    fr.client,
+    s1.root_page,
+    s1.third_party,
+    fr.third_party,
+    s1.consent_signals,
+    fr.consent_signals
+),
+
+-- Calculate survival stats by step (input rows are grouped (page, party, signals) tuples, so the counts below are per tuple, not per request)
+step_1_stats AS (
+  SELECT
+    client,
+    1 AS step_number,
+
+    COUNTIF(consent_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(consent_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(consent_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(consent_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(consent_signals.has_any_signal) AS any_signal_count,
+
+    COUNT(0) AS total_requests,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    step_1_requests
+  GROUP BY
+    client
+),
+
+step_2_stats AS (
+  SELECT
+    client,
+    2 AS step_number,
+
+    COUNTIF(step2_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step2_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step2_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step2_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step2_signals.has_any_signal) AS any_signal_count,
+
+    COUNT(0) AS total_requests,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    step_2_requests
+  GROUP BY
+    client
+),
+
+-- Combine step statistics
+combined_stats AS (
+  SELECT * FROM step_1_stats
+  UNION ALL
+  SELECT * FROM step_2_stats
+),
+
+-- Get baselines for survival rate calculation
+baselines AS (
+  SELECT
+    client,
+    usp_standard_count AS usp_standard_baseline,
+    usp_nonstandard_count AS usp_nonstandard_baseline,
+    tcf_standard_count AS tcf_standard_baseline,
+    gpp_standard_count AS gpp_standard_baseline,
+    any_signal_count AS any_signal_baseline
+  FROM
+    combined_stats
+  WHERE
+    step_number = 1
+)
+
+-- Final survival rate output (simplified)
+SELECT
+  cs.client,
+  cs.step_number,
+  cs.total_requests,
+  cs.total_pages,
+
+  -- Signal counts and survival rates
+  cs.usp_standard_count,
+  SAFE_DIVIDE(cs.usp_standard_count, b.usp_standard_baseline) AS usp_standard_survival_rate,
+
+  cs.usp_nonstandard_count,
+  SAFE_DIVIDE(cs.usp_nonstandard_count, b.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
+
+  cs.tcf_standard_count,
+  SAFE_DIVIDE(cs.tcf_standard_count, b.tcf_standard_baseline) AS tcf_standard_survival_rate,
+
+  cs.gpp_standard_count,
+  SAFE_DIVIDE(cs.gpp_standard_count, b.gpp_standard_baseline) AS gpp_standard_survival_rate,
+
+  cs.any_signal_count,
+  SAFE_DIVIDE(cs.any_signal_count, b.any_signal_baseline) AS any_signal_survival_rate
+
+FROM
+  combined_stats cs
+JOIN
+  baselines b
+USING (client)
+
+ORDER BY
+  client,
+  step_number
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql
new file mode 100644
index 00000000000..f2bc1d609bc
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql
@@ -0,0 +1,227 @@
+#standardSQL
+# Consent signal survival rate through HTTP redirects (memory-efficient)
+
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL,
+  signal_count INT64
+>
+LANGUAGE js AS """
+  try {
+    if (!url || typeof url !== 'string') return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.signal_count = [
+      signals.has_usp_standard, signals.has_usp_nonstandard,
+      signals.has_tcf_standard, signals.has_gpp_standard
+    ].filter(Boolean).length;
+
+    signals.has_any_signal = signals.signal_count > 0;
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01' AND
+    rank <= 100000 -- Expanded to top 100K sites
+),
+
+-- Pre-filter requests with redirects and potential consent signals
+requests_with_redirects AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url AS final_url,
+    JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') AS redirect_url,
+    NET.REG_DOMAIN(r.url) AS final_domain
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-07-01' AND
+    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) AND -- Third-party only
+    JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') IS NOT NULL AND
+    JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') != '' AND
+    (
+      -- Pre-filter: only URLs with consent signals in final URL or redirect URL
+      REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') OR
+      REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl'), r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+    )
+),
+
+-- Simplified redirect parsing - 2 step analysis
+-- NOTE(review): HAR `response.redirectURL` is the Location *target*; if summary.redirectUrl mirrors it,
+-- r.url is the pre-redirect URL and the step 1 / step 2 labels below are inverted - confirm before publishing.
+redirect_steps AS (
+  SELECT
+    client,
+    page,
+    final_url,
+    final_domain,
+
+    -- Step 1: Original redirect URL (before redirect)
+    redirect_url AS step1_url,
+
+    -- Step 2: Final URL (after redirect)
+    final_url AS step2_url
+  FROM
+    requests_with_redirects
+  WHERE
+    redirect_url IS NOT NULL AND
+    redirect_url != ''
+),
+
+-- Extract consent signals for each step
+signals_by_step AS (
+  SELECT
+    client,
+    page,
+    final_domain,
+
+    -- Step 1 signals (original redirect URL)
+    step1_url,
+    extractConsentSignals(step1_url) AS step1_signals,
+
+    -- Step 2 signals (final URL after redirect)
+    step2_url,
+    extractConsentSignals(step2_url) AS step2_signals
+  FROM
+    redirect_steps
+  WHERE
+    step1_url IS NOT NULL
+),
+
+-- Calculate step-wise aggregations (memory efficient)
+step_aggregations AS (
+  -- Step 1 stats (original redirect URL)
+  SELECT
+    client,
+    1 AS redirect_step,
+    'original' AS step_type,
+
+    COUNTIF(step1_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step1_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step1_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step1_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step1_signals.has_any_signal) AS any_signal_count,
+
+    AVG(step1_signals.signal_count) AS avg_signal_count,
+    COUNT(0) AS total_urls,
+    COUNT(DISTINCT page) AS total_pages
+  FROM
+    signals_by_step
+  WHERE
+    step1_signals.has_any_signal = TRUE -- Only analyze chains that start with signals
+  GROUP BY
+    client
+
+  UNION ALL
+
+  -- Step 2 stats (final URL after redirect)
+  SELECT
+    client,
+    2 AS redirect_step,
+    'final' AS step_type,
+
+    COUNTIF(step2_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step2_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step2_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step2_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step2_signals.has_any_signal) AS any_signal_count,
+
+    AVG(step2_signals.signal_count) AS avg_signal_count,
+    COUNT(0) AS total_urls,
+    COUNT(DISTINCT page) AS total_pages
+  FROM
+    signals_by_step
+  WHERE
+    step1_signals.has_any_signal = TRUE -- Same baseline
+  GROUP BY
+    client
+),
+
+-- Calculate baselines (step 1)
+baselines AS (
+  SELECT
+    client,
+    usp_standard_count AS usp_standard_baseline,
+    usp_nonstandard_count AS usp_nonstandard_baseline,
+    tcf_standard_count AS tcf_standard_baseline,
+    gpp_standard_count AS gpp_standard_baseline,
+    any_signal_count AS any_signal_baseline,
+    avg_signal_count AS avg_signal_count_baseline
+  FROM
+    step_aggregations
+  WHERE
+    redirect_step = 1
+)
+
+-- Final output with survival rates
+SELECT
+  sa.client,
+  sa.redirect_step,
+  sa.step_type,
+  sa.total_urls,
+  sa.total_pages,
+
+  -- Signal survival rates
+  sa.usp_standard_count,
+  SAFE_DIVIDE(sa.usp_standard_count, b.usp_standard_baseline) AS usp_standard_survival_rate,
+
+  sa.usp_nonstandard_count,
+  SAFE_DIVIDE(sa.usp_nonstandard_count, b.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
+
+  sa.tcf_standard_count,
+  SAFE_DIVIDE(sa.tcf_standard_count, b.tcf_standard_baseline) AS tcf_standard_survival_rate,
+
+  sa.gpp_standard_count,
+  SAFE_DIVIDE(sa.gpp_standard_count, b.gpp_standard_baseline) AS gpp_standard_survival_rate,
+
+  sa.any_signal_count,
+  SAFE_DIVIDE(sa.any_signal_count, b.any_signal_baseline) AS any_signal_survival_rate,
+
+  -- Signal count preservation
+  sa.avg_signal_count,
+  b.avg_signal_count_baseline,
+  sa.avg_signal_count - b.avg_signal_count_baseline AS signal_count_change,
+  SAFE_DIVIDE(sa.avg_signal_count, b.avg_signal_count_baseline) AS signal_count_retention_rate
+
+FROM
+  step_aggregations sa
+JOIN
+  baselines b
+USING (client)
+
+ORDER BY
+  client,
+  redirect_step
diff --git a/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql b/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql
new file mode 100644
index 00000000000..cbc7da0a4e5
--- /dev/null
+++ b/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql
@@ -0,0 +1,201 @@
+#standardSQL
+# Detailed breakdown of consent signals by individual parameters and top domains
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'
+),
+
+requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01' AND
+    -- Pre-filter: only process URLs that contain consent-related parameters
+    REGEXP_CONTAINS(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+-- Single-pass parameter extraction using one comprehensive regex
+parameter_extraction AS (
+  SELECT
+    r.client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    -- Extract all relevant parameters in one pass using REGEXP_EXTRACT_ALL
+    REGEXP_EXTRACT_ALL(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') AS found_parameters
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping
+),
+
+-- Flatten parameters and count occurrences
+flattened_parameters AS (
+  SELECT
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param
+  FROM
+    parameter_extraction,
+    UNNEST(found_parameters) AS param
+),
+
+-- Aggregate parameter counts
+parameter_counts AS (
+  SELECT
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param,
+    COUNT(0) AS param_count
+  FROM
+    flattened_parameters
+  GROUP BY
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param
+),
+
+-- Total request counts for percentage calculations: scans ALL requests (not the pre-filtered CTE),
+-- but third_party itself was built from the pre-filtered requests, so only those domains are counted
+totals AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    COUNT(0) AS total_all_requests
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    r.date = '2025-07-01' AND
+    p.rank <= rank_grouping
+  GROUP BY
+    r.client,
+    rank_grouping
+),
+
+-- Categorize parameters
+categorized_params AS (
+  SELECT
+    client,
+    rank_grouping,
+    param,
+    CASE
+      WHEN param = 'us_privacy' THEN 'USP Standard'
+      WHEN param IN ('ccpa', 'usp_consent', 'uspString', 'uspConsent', 'ccpa_consent', 'usp', 'usprivacy', 'ccpaconsent', 'usp_string') THEN 'USP Non-Standard'
+      WHEN param IN ('gdpr', 'gdpr_consent', 'gdpr_pd') THEN 'TCF Standard'
+      WHEN param IN ('gpp', 'gpp_sid') THEN 'GPP Standard'
+    END AS signal_category,
+    SUM(param_count) AS total_requests,
+    COUNT(DISTINCT canonicalDomain) AS domains_using
+  FROM
+    parameter_counts
+  GROUP BY
+    client,
+    rank_grouping,
+    param,
+    signal_category
+)
+
+-- Parameter frequency analysis
+SELECT
+  'Parameter Frequency' AS analysis_type,
+  client,
+  rank_grouping,
+  param AS parameter_name,
+  signal_category,
+  total_requests,
+  domains_using,
+  total_requests / totals.total_all_requests AS pct_of_all_requests
+FROM
+  categorized_params
+JOIN
+  totals
+USING (client, rank_grouping)
+
+UNION ALL
+
+-- Top domains analysis (simplified)
+SELECT
+  'Top Domains' AS analysis_type,
+  client,
+  rank_grouping,
+  canonicalDomain AS parameter_name,
+  category AS signal_category,
+  SUM(param_count) AS total_requests,
+  COUNT(DISTINCT param) AS domains_using,
+  SUM(param_count) / MAX(totals.total_all_requests) AS pct_of_all_requests
+FROM
+  parameter_counts
+JOIN
+  totals
+USING (client, rank_grouping)
+GROUP BY
+  client,
+  rank_grouping,
+  canonicalDomain,
+  category
+HAVING
+  SUM(param_count) > 0
+
+ORDER BY
+  analysis_type,
+  client,
+  rank_grouping,
+  total_requests DESC
+LIMIT 1000
diff --git a/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql b/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql
new file mode 100644
index 00000000000..fdd8364f9d5
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql
@@ -0,0 +1,65 @@
+CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  // Helper function to find all initiator_etlds for a given root_page
+  function findInitiators(page, visited, data) {
+    // Find all entries where the root_page matches and the initiator_etld hasn't been visited
+    const initiators = data
+      .filter(row => row.root_page === page && !visited.includes(row.initiator_etld))
+      .map(row => row.initiator_etld);
+
+    // Add the newly found initiators to the visited list
+    visited = visited.concat(initiators);
+
+    // Recursively process all new initiators
+    initiators.forEach(initiator => {
+      visited = findInitiators(initiator, visited, data);
+    });
+
+    return visited;
+  }
+
+  // Main call: Start recursion from the rootPage
+  // Use a Set to ensure that all returned values are distinct
+  return Array.from(new Set(findInitiators(rootPage, [], data)));
+""";
+
+WITH data AS (
+  -- TP interact with other tps - only extract necessary fields
+  SELECT
+    client,
+    root_page,
+    third_party,
+    initiator_etld
+  FROM (
+    SELECT
+      client,
+      NET.REG_DOMAIN(root_page) AS root_page,
+      NET.REG_DOMAIN(url) AS third_party,
+      NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
+    FROM
+      `httparchive.crawl.requests`
+    WHERE
+      date = '2025-07-01' AND
+      NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url)
+  )
+  WHERE third_party != initiator_etld AND
+    root_page != initiator_etld
+  GROUP BY client, root_page, third_party, initiator_etld
+)
+
+SELECT
+  client,
+  ARRAY_LENGTH(all_initiators) AS chain_length,
+  COUNT(0) AS pages_with_this_length
+FROM (
+  SELECT
+    root_page,
+    client,
+    findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators
+  FROM data
+  GROUP BY root_page, client
+)
+WHERE ARRAY_LENGTH(all_initiators) > 0
+GROUP BY client, chain_length
+ORDER BY client, chain_length;
diff --git a/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql
new file mode 100644
index 00000000000..7e2eb6165b6
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql
@@ -0,0 +1,67 @@
+#standardSQL
+# Distribution of websites by number of third parties
+
+-- updated for crawl.requests
+WITH requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01'
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+base AS (
+  SELECT
+    client,
+    page,
+    -- NOTE(review): COUNT(domain) counts matched request rows, not distinct third parties - confirm this is intended
+    COUNT(domain) AS third_parties_per_page
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+  GROUP BY
+    client,
+    page
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page
+FROM
+  base,
+  UNNEST([10, 25, 50, 75, 90]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/third-parties/length_of_chain_by_intiator.sql b/sql/2025/third-parties/length_of_chain_by_intiator.sql
new file mode 100644
index 00000000000..06bb39ecf6e
--- /dev/null
+++ b/sql/2025/third-parties/length_of_chain_by_intiator.sql
@@ -0,0 +1,75 @@
+CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  // Helper function to find all initiator_etlds for a given root_page
+  function findInitiators(page, visited, data) {
+    // Find all entries where the root_page matches and the initiator_etld hasn't been visited
+    const initiators = data
+      .filter(row => row.root_page === page && !visited.includes(row.initiator_etld))
+      .map(row => row.initiator_etld);
+
+    // Add the newly found initiators to the visited list
+    visited = visited.concat(initiators);
+
+    // Recursively process all new initiators
+    initiators.forEach(initiator => {
+      visited = findInitiators(initiator, visited, data);
+    });
+
+    return visited;
+  }
+
+  // Main call: Start recursion from the rootPage
+  // Use a Set to ensure that all returned values are distinct
+  return Array.from(new Set(findInitiators(rootPage, [], data)));
+""";
+
+WITH data AS (
+  -- TP interact with other tps
+  SELECT
+    client,
+    root_page,
+    third_party,
+    initiator_etld
+  FROM (
+    SELECT
+      client,
+      NET.REG_DOMAIN(root_page) AS root_page,
+      NET.REG_DOMAIN(url) AS third_party,
+      NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
+    FROM
+      `httparchive.crawl.requests`
+    WHERE
+      NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND
+      date = '2025-07-01'
+  )
+  WHERE third_party != initiator_etld AND
+    root_page != initiator_etld
+  GROUP BY client, root_page, third_party, initiator_etld
+)
+
+-- Top initiators by chain length: average / maximum chain length and page count per first initiator
+SELECT
+  client,
+  first_initiator,
+  AVG(ARRAY_LENGTH(all_initiators)) AS avg_chain_length,
+  MAX(ARRAY_LENGTH(all_initiators)) AS max_chain_length,
+  COUNT(0) AS pages
+FROM (
+  SELECT
+    root_page,
+    client,
+    all_initiators,
+    all_initiators[OFFSET(0)] AS first_initiator -- First third-party in chain
+  FROM (
+    SELECT
+      root_page,
+      client,
+      findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators
+    FROM data
+    GROUP BY root_page, client
+  )
+  WHERE ARRAY_LENGTH(all_initiators) > 0
+)
+GROUP BY client, first_initiator
+ORDER BY avg_chain_length DESC;
diff --git a/sql/2025/third-parties/number_of_third_parties_by_rank.sql b/sql/2025/third-parties/number_of_third_parties_by_rank.sql
new file mode 100644
index 00000000000..3e858d5c99b
--- /dev/null
+++ b/sql/2025/third-parties/number_of_third_parties_by_rank.sql
@@ -0,0 +1,84 @@
+#standardSQL
+# Number of third-parties per websites by rank
+
+WITH requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page = true
+),
+
+pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages` AS pg
+  WHERE
+    pg.date = '2025-07-01' AND
+    pg.is_root_page = true
+),
+
+third_party AS (
+  SELECT
+    domain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+base AS (
+  SELECT
+    client,
+    page,
+    rank,
+    -- NOTE(review): COUNT(domain) counts matched request rows, not distinct third parties - confirm this is intended
+    COUNT(domain) AS third_parties_per_page
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+  INNER JOIN
+    pages
+  USING (client, page)
+  GROUP BY
+    client,
+    page,
+    rank
+)
+
+SELECT
+  client,
+  rank_grouping,
+  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page
+FROM
+  base,
+  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping
+GROUP BY
+  client,
+  rank_grouping
+ORDER BY
+  client,
+  rank_grouping
diff --git a/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql b/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql
new file mode 100644
index 00000000000..0e3e8711004
--- /dev/null
+++ b/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql
@@ -0,0 +1,95 @@
+#standardSQL
+# Number of third-parties per websites by rank and category
+
+WITH requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page
+),
+
+pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category NOT IN ('hosting')
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+base AS (
+  SELECT
+    client,
+    category,
+    page,
+    rank,
+    -- NOTE(review): COUNT(domain) counts matched request rows, not distinct third parties - confirm this is intended
+    COUNT(domain) AS third_parties_per_page
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+  INNER JOIN
+    pages
+  USING (client, page)
+  GROUP BY
+    client,
+    category,
+    page,
+    rank
+)
+
+SELECT
+  client,
+  category,
+  rank_grouping,
+  CASE
+    WHEN rank_grouping = 100000000 THEN 'all'
+    ELSE FORMAT("%'d", rank_grouping)
+  END AS ranking,
+  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page
+FROM
+  base,
+  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping
+GROUP BY
+  client,
+  category,
+  rank_grouping
+ORDER BY
+  client,
+  category,
+  rank_grouping
diff --git a/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql b/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql
new file mode 100644
index 00000000000..f786b83715a
--- /dev/null
+++ b/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql
@@ -0,0 +1,88 @@
+#standardSQL
+# Number of third-party providers per websites by rank and category
+
+WITH requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-07-01'
+),
+
+pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category NOT IN ('hosting')
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+base AS (
+  SELECT
+    client,
+    category,
+    page,
+    rank,
+    COUNT(DISTINCT canonicalDomain) AS third_parties_per_page
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+  INNER JOIN
+    pages
+  USING (client, page)
+  GROUP BY
+    client,
+    category,
+    page,
+    rank
+)
+
+SELECT
+  client,
+  category,
+  rank_grouping,
+  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page
+FROM
+  base,
+  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping
+GROUP BY
+  client,
+  category,
+  rank_grouping
+ORDER BY
+  client,
+  category,
+ rank_grouping diff --git a/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql b/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql new file mode 100644 index 00000000000..c63c3ba89cb --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql @@ -0,0 +1,61 @@ +#standardSQL +# Number of third-party requests by rank + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` AS req + WHERE + req.date = '2025-07-01' AND + req.is_root_page = true +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` AS pg + WHERE + pg.date = '2025-07-01' AND + pg.is_root_page = true +), + +third_party AS ( + SELECT + tp.client, + tp.rank, + COUNT(DISTINCT r.url) AS distinct_tp_requests, + COUNT(r.url) AS tp_requests, + rank_grouping + FROM + pages tp + INNER JOIN + requests r + ON NET.HOST(tp.page) = NET.HOST(r.page) AND tp.client = r.client + CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + tp.rank <= rank_grouping + GROUP BY + tp.client, + tp.rank, + rank_grouping +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS median_distinct_tp_requests, + APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS median_tp_requests +FROM + third_party +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping; diff --git a/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql b/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql new file mode 100644 index 00000000000..cfc54316b1b --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql @@ -0,0 +1,61 @@ +#standardSQL +# Number of third-party requests per page by rank + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' AND + is_root_page = true +), + +pages AS ( + SELECT + client, + 
page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = true +), + +third_party AS ( + SELECT + tp.client, + tp.page, + tp.rank, + COUNT(DISTINCT r.url) AS distinct_tp_requests, + COUNT(r.url) AS tp_requests + FROM + pages tp + INNER JOIN + requests r + ON NET.HOST(tp.page) = NET.HOST(r.page) AND tp.client = r.client + GROUP BY + tp.client, + tp.page, + tp.rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS p50_distinct_tp_requests_per_page, + APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS p50_tp_requests_per_page +FROM + third_party, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping; diff --git a/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql new file mode 100644 index 00000000000..18481200c68 --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql @@ -0,0 +1,55 @@ +#standardSQL +# Percent of third party requests by content type. 
+ +WITH requests AS ( + SELECT + client, + page, + url, + type AS contentType + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +) + +SELECT + client, + contentType, + COUNT(0) AS requests, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL +GROUP BY + client, + contentType +ORDER BY + client, + contentType diff --git a/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql new file mode 100644 index 00000000000..ee4ff6a1052 --- /dev/null +++ b/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -0,0 +1,68 @@ +#standardSQL +# Percent of websites with third parties by ranking + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' +) + +SELECT + client, + rank_grouping, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT 
IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party +FROM + pages +JOIN + requests +USING (client, page) +LEFT JOIN + third_party +ON NET.HOST(requests.url) = NET.HOST(third_party.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql b/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql new file mode 100644 index 00000000000..580c8df3002 --- /dev/null +++ b/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql @@ -0,0 +1,193 @@ +#standardSQL +# Prevalence of specific consent signals (USP, TCF, GPP) in third-party requests + +WITH pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' +), + +requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +totals AS ( + SELECT + r.client, + rank_grouping, + COUNT(DISTINCT r.page) AS total_pages, + COUNT(0) AS total_requests + FROM + requests r + INNER JOIN + pages p + ON + r.client = p.client AND r.page = p.page, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + p.rank <= rank_grouping + GROUP BY + r.client, + rank_grouping +), + +third_party_requests AS ( + SELECT + r.client, + r.page, + r.url, + canonicalDomain, + category, + rank_grouping + FROM + requests r + INNER JOIN + pages p + ON + 
r.client = p.client AND r.page = p.page + INNER JOIN + third_party tp + ON + NET.HOST(r.url) = NET.HOST(tp.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + p.rank <= rank_grouping +), + +-- Consent signal detection: one comprehensive regex pre-filters the rows, then a separate regex per standard sets each flag +consent_signals AS ( + SELECT + client, + page, + url, + canonicalDomain, + category, + rank_grouping, + + -- List every consent parameter name found in the URL (kept for inspection; the flags below do not read it) + REGEXP_EXTRACT_ALL(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') AS found_params, + + -- Boolean flags, each computed by its own regex directly against the URL (not derived from found_params) + REGEXP_CONTAINS(url, r'[?&]us_privacy=') AS has_usp_standard, + REGEXP_CONTAINS(url, r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard, + REGEXP_CONTAINS(url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard, + REGEXP_CONTAINS(url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard + FROM + third_party_requests + WHERE + -- Pre-filter: keep only third-party requests whose URL carries at least one consent parameter + REGEXP_CONTAINS(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') +), + +-- Add computed flag for any consent signal +signals_with_any AS ( + SELECT + *, + (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal + FROM + consent_signals +), + +-- Create a single aggregation table to avoid repetitive calculations +signal_aggregates AS ( + SELECT + client, + rank_grouping, + -- USP Standard metrics + COUNTIF(has_usp_standard) AS usp_standard_requests, + COUNT(DISTINCT CASE WHEN has_usp_standard THEN page 
END) AS usp_standard_pages, + COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains, + + -- USP Non-Standard metrics + COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests, + COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages, + COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains, + + -- TCF Standard metrics + COUNTIF(has_tcf_standard) AS tcf_standard_requests, + COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages, + COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains, + + -- GPP Standard metrics + COUNTIF(has_gpp_standard) AS gpp_standard_requests, + COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages, + COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains, + + -- Any consent signal metrics + COUNTIF(has_any_consent_signal) AS any_consent_requests, + COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages, + COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains, + + -- Totals for this filtered dataset + COUNT(0) AS total_third_party_requests + FROM + signals_with_any + GROUP BY + client, + rank_grouping +) + +-- Final output using UNNEST to avoid repetitive UNION ALL +SELECT + agg.client, + agg.rank_grouping, + signal_data.signal_type, + signal_data.requests_with_signal, + agg.total_third_party_requests, + signal_data.requests_with_signal / agg.total_third_party_requests AS pct_requests_with_signal, + signal_data.pages_with_signal, + totals.total_pages, + signal_data.pages_with_signal / totals.total_pages AS pct_pages_with_signal, + signal_data.domains_with_signal +FROM + signal_aggregates agg +JOIN + totals +USING (client, rank_grouping) +CROSS JOIN + UNNEST([ + STRUCT('USP Standard' AS signal_type, usp_standard_requests AS 
requests_with_signal, usp_standard_pages AS pages_with_signal, usp_standard_domains AS domains_with_signal), + STRUCT('USP Non-Standard' AS signal_type, usp_nonstandard_requests AS requests_with_signal, usp_nonstandard_pages AS pages_with_signal, usp_nonstandard_domains AS domains_with_signal), + STRUCT('TCF Standard' AS signal_type, tcf_standard_requests AS requests_with_signal, tcf_standard_pages AS pages_with_signal, tcf_standard_domains AS domains_with_signal), + STRUCT('GPP Standard' AS signal_type, gpp_standard_requests AS requests_with_signal, gpp_standard_pages AS pages_with_signal, gpp_standard_domains AS domains_with_signal), + STRUCT('Any Consent Signal' AS signal_type, any_consent_requests AS requests_with_signal, any_consent_pages AS pages_with_signal, any_consent_domains AS domains_with_signal) + ]) AS signal_data + +ORDER BY + client, + rank_grouping, + signal_type diff --git a/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..dc915aa07d8 --- /dev/null +++ b/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql @@ -0,0 +1,80 @@ +#standardSQL +# Top 100 third parties by number of websites + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' +), + +totals AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages, + COUNT(0) AS total_requests + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-07-01' + GROUP BY + client +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 5 +) + +SELECT + client, + canonicalDomain, + 
COUNT(DISTINCT page) AS pages, + total_pages, + COUNT(DISTINCT page) / total_pages AS pct_pages, + COUNT(0) AS requests, + total_requests, + COUNT(0) / total_requests AS pct_requests, + DENSE_RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS sorted_order +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +JOIN + totals +USING (client) +WHERE + canonicalDomain IS NOT NULL +GROUP BY + client, + total_pages, + total_requests, + canonicalDomain +QUALIFY + sorted_order <= 100 +ORDER BY + pct_pages DESC, + client diff --git a/src/config/2025.json b/src/config/2025.json index eb8e9910749..6785af68a8b 100644 --- a/src/config/2025.json +++ b/src/config/2025.json @@ -45,8 +45,7 @@ "part": "I", "chapter_number": "5", "title": "Third Parties", - "slug": "third-parties", - "todo": true + "slug": "third-parties" }, { "part": "I", diff --git a/src/config/contributors.json b/src/config/contributors.json index 7146a13a289..7cb9de4bd18 100644 --- a/src/config/contributors.json +++ b/src/config/contributors.json @@ -715,7 +715,8 @@ "analysts", "committee", "developers", - "editors" + "editors", + "reviewers" ] }, "twitter": "tunetheweb", @@ -3436,6 +3437,27 @@ ] } }, + "abubakaraziz": { + "avatar_url": "10674679", + "github": "abubakaraziz", + "linkedin": "aziz313f", + "name": "Muhammad Abu Bakar Aziz", + "teams": { + "2025": [ + "authors" + ] + } + }, + "jazlan01": { + "github": "jazlan01", + "name": "Muhammad Jazlan", + "teams": { + "2025": [ + "analysts", + "authors" + ] + } + }, "natedame": { "avatar_url": "67608345", "github": "natedame", diff --git a/src/content/en/2025/third-parties.md b/src/content/en/2025/third-parties.md index 6deb792779e..0a30297b786 100644 --- a/src/content/en/2025/third-parties.md +++ b/src/content/en/2025/third-parties.md @@ -3,18 +3,242 @@ title: Third Parties description: Third Parties chapter of the 2025 Web Almanac covering data of what third parties are used on the web and an 
analysis of inclusion chains of third parties. hero_alt: Hero image of Web Almanac characters plugging various things into a web page. -authors: [] -reviewers: [] -analysts: [] -editors: [] +authors: [jazlan01, abubakaraziz] +reviewers: [tunetheweb] +analysts: [jazlan01] +editors: [tunetheweb] translators: [] +jazlan01_bio: Muhammad Jazlan is a second year PhD student in Computer Science at University of California, Davis. His research focuses on the measurement, detection and mitigation of tracking on the web. +abubakaraziz_bio: Muhammad Abu Bakar Aziz is a PhD candidate in Computer Science at Northeastern University in Boston. His research focuses on web privacy. In particular, he empirically measures how third parties and online advertisers comply with privacy laws such as the CCPA and GDPR. results: https://docs.google.com/spreadsheets/d/1FPssodcLgX8iFWFXDrthWVkBCUTl5_IJon2cyaZVudU/edit -featured_quote: ... -featured_stat_1: ... -featured_stat_label_1: ... -featured_stat_2: ... -featured_stat_label_2: ... -featured_stat_3: ... -featured_stat_label_3: ... -doi: ... +featured_quote: The top 10 third-party domains are dominated by Google. +featured_stat_1: 90% +featured_stat_label_1: Pages with at least one third party +featured_stat_2: 16 +featured_stat_label_2: The median number of third-party domains present on a page +featured_stat_3: 18% +featured_stat_label_3: Percentage of websites that use TCF Standard +doi: TODO --- + +## Introduction + +Third parties are ubiquitous on the web. Website developers rely on them to implement key features such as advertising, analytics, social media integration, payment processing, and content delivery. This modular approach enables efficient and rapid deployment of rich functionality. However, it introduces potential privacy, security, and performance concerns. 
New this year, we analyze how user consent choices are propagated among third parties on the web, including the consent frameworks used and the third parties that receive these signals. + +In this chapter, we conduct an empirical analysis of third-party usage patterns on the web. We examine: + +- **Prevalence:** How many websites use third parties and in what proportions +- **Resource types:** The forms third parties take (images, JavaScript, fonts, etc.) +- **Functional categories:** Ad networks, analytics, CDNs, video providers, tag managers, and others +- **Integration methods:** How third parties are loaded directly or indirectly on pages +- **Consent infrastructure:** Which third parties transfer consent signals and how those transmissions happen in practice + +## Definitions + +First, we establish some definitions and terminology that are used throughout our analysis. + +### Sites and pages + +In this chapter, like previous years, we use the term site to depict the registerable part of a given domain which is often referred to as *extended Top Level Domain plus one* (eTLD+1). For example, given the URL `https://www.bar.com/` the eTLD+1 is `bar.com` and for the URL `https://foo.co.uk` the eTLD+1 is `foo.co.uk`. By page (or web page), we mean a unique URL or, more specifically, the document (for example HTML or JavaScript) located at the particular URL. + +### What is a third party? + +We stick to the definition of a third party used in previous editions of the Web Almanac to allow for comparison with earlier versions. + +A _third party_ is an entity different from the site owner (also known as the first party). It involves the aspects of the site not directly implemented and served by the site owner. More precisely, third-party content is loaded from a different site rather than the one originally visited by the user. 
Assume that the user visits `example.com` (the first party) and `example.com` includes silly cat images from `awesome-cats.edu` (for example using an `<img>` tag). In that scenario, `awesome-cats.edu` is the third party, as it was not originally visited by the user. However, if the user directly visits `awesome-cats.edu`, `awesome-cats.edu` is the first party. + +For our analysis, only third parties originating from a domain whose resources can be found on at least five unique pages in the HTTP Archive dataset were included. + +When third-party content is directly served from a first party domain, it is counted as first party content. For example, self-hosted analytics scripts, CSS, or fonts are counted as first party content. Similarly, first-party content served from a third-party domain is counted as third-party content. Some third parties serve content from different subdomains. However, regardless of the number of subdomains, they are counted as a single third party. + +Further, it is becoming increasingly common for third parties to masquerade as a first party. Two key techniques enable this: + +- **CNAME cloaking** involves using a CNAME record to make a third party's content appear to come from the first party domain. We consider CNAME-cloaked services as first parties in this analysis. + +- **Server-side tracking** is an emerging trend where the site owner embeds the tracker as a first party and routes all requests through the first party domain, making the tracker appear as a first party. For example, a website `www.example.com` may embed server-side Google Tag Manager with Google Analytics and cloak the subdomain `sst.example.com` to send requests to a Google Tag Manager container. In this way, requests to third parties originate from the tag manager's server rather than the user's browser. 
+ +In our analysis, we treat such cases as first party interactions because the third party communication occurs server-to-server and is not directly observable in the client-side HTTP Archive data. As a result, our measurements represent a lower bound on the actual prevalence of third parties on the web. + +## Categories + +As previously indicated, third parties can be used for various use cases—for example, to include videos, to serve ads, or to include content from social media sites. Similar to the previous year, to categorize the observed third parties in our dataset, we rely on the Third-Party Web repository from Patrick Hulce. The repository breaks down third parties along the following categories: + +- **Ad:** These scripts are part of advertising networks, either serving or measuring. +- **Analytics:** These scripts measure or track users and their actions. There’s a wide range of impact here, depending on what’s being tracked. +- **CDN:** These are a mixture of publicly hosted open source libraries (for example jQuery) served over different public CDNs and private CDN usage. +- **Content:** These scripts are from content providers or publishing-specific affiliate tracking. +- **Customer Success:** These scripts are from customer support/marketing providers that offer chat and contact solutions. These scripts are generally heavier in weight. +- **Hosting:** These scripts are from web hosting platforms (WordPress, Wix, Squarespace, etc.). +- **Marketing:** These scripts are from marketing tools that add popups/newsletters/etc. +- **Social:** These scripts enable social features. +- **Tag Manager:** These scripts tend to load many other scripts and initiate many tasks. +- **Utility:** These scripts are developer utilities (API clients, site monitoring, fraud detection, etc.). +- **Video:** These scripts enable video player and streaming functionality. +- **Consent provider:** These scripts allow sites to manage the user consent (e.g. 
for the [General Data Protection Regulation](https://wikipedia.org/wiki/General_Data_Protection_Regulation) compliance). They are also known as the ’Cookie Consent’ popups and are usually loaded on the critical path. +- **Other:** These are miscellaneous scripts delivered via a shared origin with no precise category or attribution. + +### `Content-Type` + +We use the [`Content-Type`](https://developer.mozilla.org/docs/Web/HTTP/Headers/Content-Type) HTTP header to categorize third-party resources into different types, such as scripts, HTML content, JSON data, plain text, and images. This allows us to analyze the composition of third-party resources served across websites. + +## Prevalence + +{{ figure_markup( + image="pages-using-at-least-one-3p.png", + caption="Percentage of pages that use one or more third parties.", + description="Bar chart showing the percentage of pages across different rank groups that use at least one third party. Around 90%-92% of pages use third parties across different rank groups.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=249114645&format=interactive", + sheets_gid="1741089577", + sql_file="percent_of_websites_with_third_party_by_ranking.sql" + ) +}} + +Compared to [the previous year](../2024/third-parties#prevalence), we observe a slight decrease in the percentage of pages that use one or more third parties across websites. However, despite this decrease, the percentage of pages with one or more third parties remains greater than or equal to 90%. + +{{ figure_markup( + image="num-3p-by-rank.png", + caption="Distribution of the number of third parties by rank.", + description="Bar chart showing distribution of number of third parties by rank groups. 
The number of third parties decreases with increasing rank groups.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=211745165&format=interactive", + sheets_gid="199539546", + sql_file="number_of_third_parties_by_rank.sql" + ) +}} + +Compared to the previous year, we observe a significant decrease in the median number of third-party domains across all website ranks, with a particularly large decrease among low-ranked websites. + +This decline may be due to several factors. First, third parties are increasingly obscured through `CNAME` cloaking and server-side tracking, which can reduce their visibility in client-side measurements. Second, HTTP Archive crawlers do not interact with web pages or scroll down the page, which may prevent some third parties from loading properly due to lazy loading. As a result, fewer third-party requests may be observed. + +We also observe that desktop pages generally include more third parties than mobile pages. + +{{ figure_markup( + image="num-3p-req-per-page-by-rank.png", + caption="Distribution of the number of third party requests per page by rank.", + description="Bar chart displaying the median number of third party requests per page by rank. The number of third party requests per page increases from top 1K to top 10K rank groups and then decreases for higher rank groups.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=1763082827&format=interactive", + sheets_gid="641162136", + sql_file="number_of_third_party_requests_per_page_by_rank.sql" + ) +}} + +Websites with low rank numbers (that is, the most popular sites) load more third-party requests. The top 1,000 have a median of 129 requests on desktop and 106 on mobile, compared to 83 on desktop and 79 on mobile across all sites. + +Year-over-year, third-party requests have increased across all ranks. 
The top 1,000 sites show an increase of 15 requests on desktop and 15 on mobile [compared to 2024](../2024/third-parties#fig-3), while the broader dataset increased by five requests on desktop and five on mobile. This upward trend occurs despite the decrease in the number of unique third-party domains we observed earlier, suggesting that individual third parties are sending more requests per page. + +{{ figure_markup( + image="3p-req-categories-by-rank.png", + caption="Distribution of the third-party request categories by rank.", + description="Bar chart showing distribution of third-party categories by rank group. The top categories are ad, analytics, and cdn.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=1133634663&format=interactive", + sheets_gid="445864775", + sql_file="number_of_third_party_providers_by_rank_and_category.sql" + ) +}} + +The bar chart shows the median number of third-party providers per page by rank and category. In the previous edition, this analysis focused on the number of third-party domains per page by rank and category, whereas this year we measure the number of unique third-party providers, which results in lower counts overall. This year, the top categories are `ad`, `analytics`, and `cdn`. + +{{ figure_markup( + image="3p-req-types-by-rank.png", + caption="Distribution of the third-party request types by rank.", + description="Pie chart showing percentage distribution of third party requests by content type. 
The top 3 content types are `script` (24.8%), `image` (19.9%), and other (13.9%)", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=1309978891&format=interactive", + sheets_gid="418010554", + sql_file="percent_of_third_parties_by_content_type.sql" + ) +}} + +The chart shows that third-party requests are dominated by `script`, `image`, and the `other` category. Together, `script`, `image`, and `other` account for more than half of all third-party request content types. This pattern is consistent with [the 2024 edition](../2024/third-parties#fig-5), which also identified `script`, `image`, and `other` as the top request types, indicating little change since last year. + +{{ figure_markup( + image="top-3p-by-num-pages.png", + caption="Top third parties by the number of pages.", + description="Bar chart showing top third parties by the percentage of pages with their presence.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=194077318&format=interactive", + sheets_gid="803451847", + sql_file="top100_third_parties_by_number_of_websites.sql", + width=600, + height=498 + ) +}} + +The top 10 third-party domains are dominated by Google-owned services, including `fonts.googleapis.com`, `googletagmanager.com`, `google-analytics.com`, `accounts.google.com`, and `adservice.google.com`. Meta's `facebook.com` is the only non-Google domain in the top 10, appearing at rank 7 with 21% of pages. + +## Consent propagation among third parties + +In this section, we examine how different third parties transmit user consent across the web. Previous research has shown that third parties often rely on industry-standard frameworks to communicate consent information. 
In our analysis, we focus primarily on the IAB's three consent standards: the Transparency and Consent Framework (TCF), the CCPA Framework, and the Global Privacy Platform (GPP). + +These frameworks define how consent information is encoded and shared between websites and third parties. We begin by identifying which consent standards are most prevalent among the third parties observed in our dataset. To determine which framework a third party uses, we rely on the presence of specific parameters in the request URLs. Details of the different standards are below: + +- **TCF Standard**: We identify use of the TCF framework by checking whether a third-party request includes the `gdpr` or `gdpr_consent` parameters, as specified by the IAB TCF. + +- **GPP Standard**: We identify use of the GPP framework by checking for the presence of the `gpp` or `gpp_sid` parameters. + +- **USP Standard and USP Non-Standard**: We identify use of the USP Standard by checking whether a request transmits a `us_privacy` parameter, as defined by the IAB CCPA Framework. We also identify non-standard USP signals by detecting consent strings transmitted via non-standard parameters identified in prior work. + +We analyze consent signal prevalence across website ranks, third-party categories, and the most frequently observed consent-receiving third parties. 
+ +### Prevalence of consent signals across different ranks + +{{ figure_markup( + image="consent-signal-prevalence-by-rank.png", + caption="Consent signal prevalence by rank.", + description="Bar chart showing the prevalence of different consent standards in third-party requests across website ranks.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=2066656520&format=interactive", + sheets_gid="1614774531", + sql_file="consent_signal_prevalence_by_third_party_category.sql" + ) +}} + +We find that TCF Standard is the dominant consent standard, particularly among low-ranked sites where it reaches 36% compared to 18% across all sites. This higher adoption aligns with stronger opt-in consent requirements under GDPR. The USP Standard is the second most prevalent, with adoption ranging from 9–17% across ranks. This reflects use of the IAB CCPA consent framework introduced in response to the CCPA. GPP adoption remains minimal at 3–6%, despite its goal to unify consent frameworks across jurisdictions. + +### Consent standard distribution across different categories + +{{ figure_markup( + image="consent-signal-prevalence-by-category.png", + caption="Consent signal prevalence by category.", + description="Bar chart showing consent standard prevalence across different third-party categories.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=828032137&format=interactive", + sheets_gid="1614774531", + sql_file="consent_signal_prevalence_by_third_party_category.sql" + ) +}} + +We observe different consent standard preferences across different third-party categories. For example, Social services show the highest TCF adoption, while advertising vendors employ a more balanced mix of GPP, USP Standard, and smaller TCF shares. 
Furthermore, Analytics vendors predominantly adopt GPP. + +### Top third parties receiving consent + +{{ figure_markup( + image="consent-signal-prevalence-by-domain.png", + caption="Consent signal prevalence by domain.", + description="Bar chart showing the third parties that receive the highest volume of consent signals.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=1262795614&format=interactive", + sheets_gid="1788947788", + sql_file="consent_signals_by_parameter_and_domain_optimized.sql" + ) +}} + +Among top-ranked websites, `pubmatic.com` receives the highest volume of consent signals, with `adservice.google.com` in second place. The majority of domains receiving the most consent signals are advertising and ad tech vendors—ad exchanges, DSPs, and ad servers. This makes intuitive sense, as in many jurisdictions third-party advertising and analytics providers must obtain user consent before using user data for ads and other purposes. + +## Inclusion + +Recall from our earlier example that `example.com` (a first party) can include an image from `awesome-cats.edu` (a third party) via an `<img>` tag. This inclusion of an image would be considered direct inclusion. However, if the image was loaded by a third-party script on the site via `XMLHttpRequest`, then the inclusion of the image would be considered indirect inclusion. The indirectly included third parties can further include additional third parties. For example, a third-party script that is directly included on the site may further include another third-party script. In this chapter, we do a basic analysis of the depth of the inclusion chains of third parties. 
+ +{{ figure_markup( + image="median-depth-tp-inclusion-chains.png", + caption="Median depth of third-party inclusion chains.", + description="Bar chart showing the median depth of third-party inclusion chains.", + chart_url="https://docs.google.com/spreadsheets/d/e/2PACX-1vTrElluFB6gvlkt65HjzZMJ4PtgJ53tVnez46cBrhQNtNxUjDxvNPuS_xmlQBUmhSHZkOMAjd0bTJyr/pubchart?oid=692408075&format=interactive", + sheets_gid="1518420053", + sql_file="inclusion_chain.sql" + ) +}} + +The median depth of the inclusion chain is 3, which means that the majority of third parties include at least one other third party on a web page. The maximum depth of the inclusion chain is 2,285. + +## Conclusion + +Our findings show the ubiquitous and increasingly concentrated nature of third parties on the web. More than nine in ten web pages include one or more third parties. While the median number of unique third-party domains has decreased compared to the previous year, we observe a significant increase in the total number of requests from third parties, suggesting individual vendors are sending more requests per page. + +In terms of consent standards, TCF is the dominant consent standard across all website ranks. Among individual third parties, `pubmatic.com`, `adservice.google.com`, and other ad tech domains receive the highest volume of consent signals. + +Finally, the increasing use of obfuscation techniques such as CNAME cloaking and server-side tracking reduces visibility of third parties in client-side measurements, suggesting our findings represent a lower bound on actual prevalence. 
diff --git a/src/static/images/2025/third-parties/3p-req-categories-by-rank.png b/src/static/images/2025/third-parties/3p-req-categories-by-rank.png new file mode 100644 index 00000000000..258ec20732c Binary files /dev/null and b/src/static/images/2025/third-parties/3p-req-categories-by-rank.png differ diff --git a/src/static/images/2025/third-parties/3p-req-types-by-rank.png b/src/static/images/2025/third-parties/3p-req-types-by-rank.png new file mode 100644 index 00000000000..660b0f49043 Binary files /dev/null and b/src/static/images/2025/third-parties/3p-req-types-by-rank.png differ diff --git a/src/static/images/2025/third-parties/consent-signal-prevalence-by-category.png b/src/static/images/2025/third-parties/consent-signal-prevalence-by-category.png new file mode 100644 index 00000000000..65053651475 Binary files /dev/null and b/src/static/images/2025/third-parties/consent-signal-prevalence-by-category.png differ diff --git a/src/static/images/2025/third-parties/consent-signal-prevalence-by-domain.png b/src/static/images/2025/third-parties/consent-signal-prevalence-by-domain.png new file mode 100644 index 00000000000..b9a1081071a Binary files /dev/null and b/src/static/images/2025/third-parties/consent-signal-prevalence-by-domain.png differ diff --git a/src/static/images/2025/third-parties/consent-signal-prevalence-by-rank.png b/src/static/images/2025/third-parties/consent-signal-prevalence-by-rank.png new file mode 100644 index 00000000000..4d6dc7d97dc Binary files /dev/null and b/src/static/images/2025/third-parties/consent-signal-prevalence-by-rank.png differ diff --git a/src/static/images/2025/third-parties/median-depth-tp-inclusion-chains.png b/src/static/images/2025/third-parties/median-depth-tp-inclusion-chains.png new file mode 100644 index 00000000000..5754cbe2da2 Binary files /dev/null and b/src/static/images/2025/third-parties/median-depth-tp-inclusion-chains.png differ diff --git a/src/static/images/2025/third-parties/num-3p-by-rank.png 
b/src/static/images/2025/third-parties/num-3p-by-rank.png new file mode 100644 index 00000000000..3576e0beb94 Binary files /dev/null and b/src/static/images/2025/third-parties/num-3p-by-rank.png differ diff --git a/src/static/images/2025/third-parties/num-3p-req-per-page-by-rank.png b/src/static/images/2025/third-parties/num-3p-req-per-page-by-rank.png new file mode 100644 index 00000000000..d2349a403ca Binary files /dev/null and b/src/static/images/2025/third-parties/num-3p-req-per-page-by-rank.png differ diff --git a/src/static/images/2025/third-parties/pages-using-at-least-one-3p.png b/src/static/images/2025/third-parties/pages-using-at-least-one-3p.png new file mode 100644 index 00000000000..4cebf887ac2 Binary files /dev/null and b/src/static/images/2025/third-parties/pages-using-at-least-one-3p.png differ diff --git a/src/static/images/2025/third-parties/top-3p-by-num-pages.png b/src/static/images/2025/third-parties/top-3p-by-num-pages.png new file mode 100644 index 00000000000..3ce81c493df Binary files /dev/null and b/src/static/images/2025/third-parties/top-3p-by-num-pages.png differ