Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
dae7f2a
Initialized with queries from 2024
jazlan01 Aug 27, 2025
771fd5f
Updated query for 2025 spec
jazlan01 Aug 27, 2025
0f0ed90
Updated date
jazlan01 Aug 27, 2025
a9da756
Temp changes to dates, 2025-06 instead of 2025-07
jazlan01 Aug 28, 2025
ee7b00e
Updated median number of third parties by rank query
jazlan01 Aug 28, 2025
bde3438
Updated top 100 third parties by number of websites query
jazlan01 Aug 28, 2025
a0c17f7
Updated third party domains per page by rank
jazlan01 Aug 28, 2025
3c8519d
Updated percent of third parties by content type
jazlan01 Aug 28, 2025
8976952
Added prevalence of consent signals in third party requests
jazlan01 Aug 28, 2025
9a9f908
Updated queries for 2025
jazlan01 Aug 28, 2025
beff2c6
Updated third parties table date
jazlan01 Sep 15, 2025
87d7c8f
Updated third-parties.md with the content
jazlan01 Dec 27, 2025
1e302cc
Merge branch 'main' into thirdparties-sql-2025
jazlan01 Dec 31, 2025
374bad3
Merge branch 'main' into thirdparties-sql-2025
jazlan01 Jan 1, 2026
8139992
Enable chapter
tunetheweb Jan 3, 2026
59fef6d
Technical edit
tunetheweb Jan 3, 2026
d30ed0d
Lint SQL
tunetheweb Jan 3, 2026
c859d0b
Retake image
tunetheweb Jan 3, 2026
a772fd7
tunetheweb contributions
tunetheweb Jan 3, 2026
3f552ed
Linting fixes
tunetheweb Jan 3, 2026
b147e23
Linting
tunetheweb Jan 3, 2026
a7d2d17
Tweaks
tunetheweb Jan 3, 2026
1e346c4
Apply suggestions from code review
tunetheweb Jan 3, 2026
ae9218c
Update src/content/en/2025/third-parties.md
tunetheweb Jan 3, 2026
4f9ecd4
Update contributors
tunetheweb Jan 3, 2026
656d60d
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 3, 2026
1999788
Clean up images
tunetheweb Jan 3, 2026
4e9ca4f
Merge remote-tracking branch 'upstream/main' into thirdparties-sql-2025
tunetheweb Jan 6, 2026
c4ff7f8
Merge remote-tracking branch 'upstream/main' into thirdparties-sql-2025
tunetheweb Jan 6, 2026
f2a0bbb
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 9, 2026
e12b85c
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 9, 2026
6c025be
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 9, 2026
537efc5
Merge branch 'main' into thirdparties-sql-2025
jazlan01 Jan 10, 2026
abdde90
Most recently used queries
jazlan01 Jan 10, 2026
e5d36d4
Merge branch 'thirdparties-sql-2025' of github.com:jazlan01/almanac.h…
jazlan01 Jan 10, 2026
e5592e5
Updated bio for Jazlan
jazlan01 Jan 10, 2026
cfe46a8
Added featured stats, sql file names
jazlan01 Jan 10, 2026
62bb93b
Fixed linter errors
jazlan01 Jan 11, 2026
0cd4ce5
text update
abubakaraziz Jan 11, 2026
217f2ea
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 11, 2026
2bad4b3
Technical edit
tunetheweb Jan 11, 2026
5c202c1
Cleaned up SQL
jazlan01 Jan 11, 2026
750602c
Fixed sql file name for third party providers
jazlan01 Jan 11, 2026
a794c8b
Added newline at the end of the file
jazlan01 Jan 11, 2026
8081630
Update sql/2025/third-parties/number_of_third_parties_by_rank_and_cat…
tunetheweb Jan 11, 2026
910fd55
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 11, 2026
1187992
Update sql/2025/third-parties/number_of_third_parties_by_rank_and_cat…
tunetheweb Jan 11, 2026
2781079
Merge branch 'main' into thirdparties-sql-2025
jazlan01 Jan 11, 2026
1ba5085
Merge branch 'main' into thirdparties-sql-2025
tunetheweb Jan 11, 2026
9d6616b
Incorporate feedback in text
abubakaraziz Jan 11, 2026
0838649
Update src/content/en/2025/third-parties.md with link to 2024 chapter
jazlan01 Jan 11, 2026
46161b7
Merge branch 'main' into thirdparties-sql-2025
jazlan01 Jan 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions sql/2025/third-parties/consent_signal_basic_analysis.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#standardSQL
# Basic consent signal analysis (simplified version to ensure data returns)
#
# Per client: counts third-party requests on top-50K pages whose URL carries a
# consent-signal query parameter (USP, TCF or GPP), the share of each signal
# type among those requests, and how many of them have a `redirects` entry
# other than `[]` in their summary.

WITH pages AS (
  SELECT
    client,
    page
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    rank <= 50000 -- Top 50K sites only
),

-- Flag each third-party request with the consent parameters present in its URL.
-- Every pattern is written exactly once here; consent_requests filters on the
-- resulting flags instead of repeating the regular expressions.
third_party_requests AS (
  SELECT
    r.client,
    r.page,
    NET.REG_DOMAIN(r.url) AS url_domain,

    -- Consent signals, detected as query-string parameter names
    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard,

    -- TRUE when the summary has a `redirects` entry that does not serialize to '[]'
    (
      JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL AND
      TO_JSON_STRING(JSON_EXTRACT(r.summary, '$.redirects')) != '[]'
    ) AS has_redirects
  FROM
    `httparchive.crawl.requests` r
  INNER JOIN
    pages p
  ON
    r.client = p.client AND r.page = p.page
  WHERE
    r.date = '2025-07-01' AND
    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
),

-- Keep only requests carrying at least one consent signal (no redirect filtering)
consent_requests AS (
  SELECT
    client,
    page,
    url_domain,
    has_usp_standard,
    has_usp_nonstandard,
    has_tcf_standard,
    has_gpp_standard,
    has_redirects
  FROM
    third_party_requests
  WHERE
    has_usp_standard OR
    has_usp_nonstandard OR
    has_tcf_standard OR
    has_gpp_standard
)

-- Basic analysis: one row per client
SELECT
  client,

  -- Overall counts
  COUNT(0) AS total_requests_with_consent_signals,
  COUNT(DISTINCT page) AS total_pages_with_consent_signals,
  COUNT(DISTINCT url_domain) AS total_domains_with_consent_signals,

  -- Signal type breakdown
  COUNTIF(has_usp_standard) AS usp_standard_requests,
  COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
  COUNTIF(has_tcf_standard) AS tcf_standard_requests,
  COUNTIF(has_gpp_standard) AS gpp_standard_requests,

  -- Percentage breakdown (share of all consent-signal requests; a request
  -- can carry several signal types, so these need not sum to 1)
  COUNTIF(has_usp_standard) / COUNT(0) AS pct_usp_standard,
  COUNTIF(has_usp_nonstandard) / COUNT(0) AS pct_usp_nonstandard,
  COUNTIF(has_tcf_standard) / COUNT(0) AS pct_tcf_standard,
  COUNTIF(has_gpp_standard) / COUNT(0) AS pct_gpp_standard,

  -- Redirect availability
  COUNTIF(has_redirects) AS requests_with_redirects,
  COUNTIF(has_redirects) / COUNT(0) AS pct_requests_with_redirects
FROM
  consent_requests
GROUP BY
  client
ORDER BY
  client
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#standardSQL
# Consent signal prevalence broken down by third-party category
#
# For every client, rank bucket and third-party category, reports how many
# requests, pages and third-party canonical domains carry each consent signal
# in the request URL, both as a count and as a share of that category's totals.
# Rows are emitted only for signal types seen at least once in the group.

WITH pages AS (
  SELECT
    client,
    page,
    rank
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01'
),

requests AS (
  SELECT
    client,
    page,
    url
  FROM
    `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01'
),

-- Third-party domains (hosting excluded) whose host was requested by at least
-- 50 distinct pages; pages are counted without regard to client.
third_party AS (
  SELECT
    tp.domain,
    tp.canonicalDomain,
    tp.category,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` tp
  INNER JOIN
    requests r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2025-07-01' AND
    tp.category != 'hosting'
  GROUP BY
    tp.domain,
    tp.canonicalDomain,
    tp.category
  HAVING
    page_usage >= 50
),

-- One row per third-party request, flagged per consent-signal family.
-- Flags are computed here, before the rank-bucket fan-out, so every pattern
-- is written once and totals and signal counts share a single join.
third_party_requests AS (
  SELECT
    r.client,
    p.rank,
    tp.category,
    tp.canonicalDomain,
    r.page,

    -- Consent signals, detected as query-string parameter names
    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard
  FROM
    requests r
  INNER JOIN
    pages p
  ON
    r.client = p.client AND r.page = p.page
  INNER JOIN
    third_party tp
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
),

-- Add computed flag for any consent signal
flagged_requests AS (
  SELECT
    client,
    rank,
    category,
    canonicalDomain,
    page,
    has_usp_standard,
    has_usp_nonstandard,
    has_tcf_standard,
    has_gpp_standard,
    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal
  FROM
    flagged_source
),

-- Category totals (denominators) and per-signal counts (numerators) in one pass.
-- Each request contributes to every rank bucket its page's rank fits into.
category_aggregates AS (
  SELECT
    client,
    rank_grouping,
    category,

    -- Totals over all third-party requests in the group
    COUNT(0) AS total_category_requests,
    COUNT(DISTINCT page) AS total_category_pages,
    COUNT(DISTINCT canonicalDomain) AS total_category_domains,

    -- USP Standard metrics
    COUNTIF(has_usp_standard) AS usp_standard_requests,
    COUNT(DISTINCT CASE WHEN has_usp_standard THEN page END) AS usp_standard_pages,
    COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains,

    -- USP Non-Standard metrics
    COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages,
    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains,

    -- TCF Standard metrics
    COUNTIF(has_tcf_standard) AS tcf_standard_requests,
    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages,
    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains,

    -- GPP Standard metrics
    COUNTIF(has_gpp_standard) AS gpp_standard_requests,
    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages,
    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains,

    -- Any consent signal metrics
    COUNTIF(has_any_consent_signal) AS any_consent_requests,
    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages,
    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains
  FROM
    flagged_requests
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
  GROUP BY
    client,
    rank_grouping,
    category
)

-- Final output: one row per signal type, using UNNEST to avoid repetitive UNION ALL
SELECT
  agg.client,
  agg.rank_grouping,
  agg.category,
  signal_data.signal_type,
  signal_data.requests_with_signal,
  agg.total_category_requests,
  signal_data.requests_with_signal / agg.total_category_requests AS pct_requests_with_signal,
  signal_data.pages_with_signal,
  agg.total_category_pages,
  signal_data.pages_with_signal / agg.total_category_pages AS pct_pages_with_signal,
  signal_data.domains_with_signal,
  agg.total_category_domains,
  signal_data.domains_with_signal / agg.total_category_domains AS pct_domains_with_signal
FROM
  category_aggregates agg
CROSS JOIN
  UNNEST([
    STRUCT('USP Standard' AS signal_type, agg.usp_standard_requests AS requests_with_signal, agg.usp_standard_pages AS pages_with_signal, agg.usp_standard_domains AS domains_with_signal),
    STRUCT('USP Non-Standard' AS signal_type, agg.usp_nonstandard_requests AS requests_with_signal, agg.usp_nonstandard_pages AS pages_with_signal, agg.usp_nonstandard_domains AS domains_with_signal),
    STRUCT('TCF Standard' AS signal_type, agg.tcf_standard_requests AS requests_with_signal, agg.tcf_standard_pages AS pages_with_signal, agg.tcf_standard_domains AS domains_with_signal),
    STRUCT('GPP Standard' AS signal_type, agg.gpp_standard_requests AS requests_with_signal, agg.gpp_standard_pages AS pages_with_signal, agg.gpp_standard_domains AS domains_with_signal),
    STRUCT('Any Consent Signal' AS signal_type, agg.any_consent_requests AS requests_with_signal, agg.any_consent_pages AS pages_with_signal, agg.any_consent_domains AS domains_with_signal)
  ]) AS signal_data
WHERE
  signal_data.requests_with_signal > 0 -- Only show categories with consent signals
ORDER BY
  client,
  rank_grouping,
  category,
  signal_type
Loading