Skip to content

Commit abdde90

Browse files
committed
Most recently used queries
1 parent 87d7c8f commit abdde90

41 files changed

Lines changed: 79 additions & 78 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

sql/2025/third-parties/compressed_images_by_3p.sql

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ third_party AS (
3131
requests r
3232
ON NET.HOST(r.url) = NET.HOST(tp.domain)
3333
WHERE
34-
date = '2025-06-01' AND
34+
date = '2025-07-01' AND
3535
category != 'hosting'
3636
GROUP BY
3737
domain

sql/2025/third-parties/consent_signal_basic_analysis.sql

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ WITH pages AS (
99
FROM
1010
`httparchive.crawl.pages`
1111
WHERE
12-
date = '2025-06-01'
12+
date = '2025-07-01'
1313
AND rank <= 50000 -- Expand to top 50K sites
1414
),
1515

@@ -38,7 +38,7 @@ consent_requests AS (
3838
ON
3939
r.client = p.client AND r.page = p.page
4040
WHERE
41-
r.date = '2025-06-01'
41+
r.date = '2025-07-01'
4242
AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
4343
AND (
4444
REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') OR

sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ WITH pages AS (
99
FROM
1010
`httparchive.crawl.pages`
1111
WHERE
12-
date = '2025-06-01'
12+
date = '2025-07-01'
1313
),
1414

1515
requests AS (
@@ -20,7 +20,7 @@ requests AS (
2020
FROM
2121
`httparchive.crawl.requests`
2222
WHERE
23-
date = '2025-06-01'
23+
date = '2025-07-01'
2424
),
2525

2626
third_party AS (

sql/2025/third-parties/consent_signal_survival_rate_through_chains_optimized.sql

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
CREATE TEMP FUNCTION extractConsentSignals(url STRING)
55
RETURNS STRUCT<
66
has_usp_standard BOOL,
7-
has_usp_nonstandard BOOL,
7+
has_usp_nonstandard BOOL,
88
has_tcf_standard BOOL,
99
has_gpp_standard BOOL,
1010
has_any_signal BOOL
@@ -17,18 +17,18 @@ LANGUAGE js AS """
1717
has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
1818
has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
1919
};
20-
21-
signals.has_any_signal = signals.has_usp_standard ||
22-
signals.has_usp_nonstandard ||
23-
signals.has_tcf_standard ||
24-
signals.has_gpp_standard;
25-
20+
21+
signals.has_any_signal = signals.has_usp_standard ||
22+
signals.has_usp_nonstandard ||
23+
signals.has_tcf_standard ||
24+
signals.has_gpp_standard;
25+
2626
return signals;
2727
} catch (e) {
2828
return {
2929
has_usp_standard: false,
3030
has_usp_nonstandard: false,
31-
has_tcf_standard: false,
31+
has_tcf_standard: false,
3232
has_gpp_standard: false,
3333
has_any_signal: false
3434
};
@@ -43,7 +43,7 @@ WITH pages AS (
4343
FROM
4444
`httparchive.crawl.pages`
4545
WHERE
46-
date = '2025-06-01'
46+
date = '2025-07-01'
4747
AND rank <= 10000 -- Aggressive filtering: top 10K only
4848
),
4949

@@ -64,7 +64,7 @@ filtered_requests AS (
6464
ON
6565
r.client = p.client AND r.page = p.page
6666
WHERE
67-
r.date = '2025-06-01'
67+
r.date = '2025-07-01'
6868
AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
6969
AND (
7070
-- Only process requests with consent signals OR that are part of chains
@@ -124,13 +124,13 @@ step_1_stats AS (
124124
SELECT
125125
client,
126126
1 AS step_number,
127-
127+
128128
COUNTIF(consent_signals.has_usp_standard) AS usp_standard_count,
129129
COUNTIF(consent_signals.has_usp_nonstandard) AS usp_nonstandard_count,
130130
COUNTIF(consent_signals.has_tcf_standard) AS tcf_standard_count,
131131
COUNTIF(consent_signals.has_gpp_standard) AS gpp_standard_count,
132132
COUNTIF(consent_signals.has_any_signal) AS any_signal_count,
133-
133+
134134
COUNT(*) AS total_requests,
135135
COUNT(DISTINCT root_page) AS total_pages
136136
FROM
@@ -143,13 +143,13 @@ step_2_stats AS (
143143
SELECT
144144
client,
145145
2 AS step_number,
146-
146+
147147
COUNTIF(step2_signals.has_usp_standard) AS usp_standard_count,
148148
COUNTIF(step2_signals.has_usp_nonstandard) AS usp_nonstandard_count,
149149
COUNTIF(step2_signals.has_tcf_standard) AS tcf_standard_count,
150150
COUNTIF(step2_signals.has_gpp_standard) AS gpp_standard_count,
151151
COUNTIF(step2_signals.has_any_signal) AS any_signal_count,
152-
152+
153153
COUNT(*) AS total_requests,
154154
COUNT(DISTINCT root_page) AS total_pages
155155
FROM
@@ -186,20 +186,20 @@ SELECT
186186
cs.step_number,
187187
cs.total_requests,
188188
cs.total_pages,
189-
189+
190190
-- Signal counts and survival rates
191191
cs.usp_standard_count,
192192
SAFE_DIVIDE(cs.usp_standard_count, b.usp_standard_baseline) AS usp_standard_survival_rate,
193-
193+
194194
cs.usp_nonstandard_count,
195195
SAFE_DIVIDE(cs.usp_nonstandard_count, b.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
196-
196+
197197
cs.tcf_standard_count,
198198
SAFE_DIVIDE(cs.tcf_standard_count, b.tcf_standard_baseline) AS tcf_standard_survival_rate,
199-
199+
200200
cs.gpp_standard_count,
201201
SAFE_DIVIDE(cs.gpp_standard_count, b.gpp_standard_baseline) AS gpp_standard_survival_rate,
202-
202+
203203
cs.any_signal_count,
204204
SAFE_DIVIDE(cs.any_signal_count, b.any_signal_baseline) AS any_signal_survival_rate
205205

sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ WITH pages AS (
4949
FROM
5050
`httparchive.crawl.pages`
5151
WHERE
52-
date = '2025-06-01'
52+
date = '2025-07-01'
5353
),
5454

5555
-- Get redirect chains from crawl.requests summary column
@@ -69,7 +69,7 @@ redirect_chains AS (
6969
ON
7070
r.client = p.client AND r.page = p.page
7171
WHERE
72-
r.date = '2025-06-01'
72+
r.date = '2025-07-01'
7373
AND JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL
7474
AND JSON_EXTRACT(r.summary, '$.redirects') != '[]'
7575
-- AND p.rank <= 100000 -- Limit to top 100K sites

sql/2025/third-parties/consent_signal_survival_rate_through_redirects_optimized.sql

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ WITH pages AS (
4949
FROM
5050
`httparchive.crawl.pages`
5151
WHERE
52-
date = '2025-06-01'
52+
date = '2025-07-01'
5353
AND rank <= 100000 -- Expanded to top 100K sites
5454
),
5555

@@ -68,7 +68,7 @@ requests_with_redirects AS (
6868
ON
6969
r.client = p.client AND r.page = p.page
7070
WHERE
71-
r.date = '2025-06-01'
71+
r.date = '2025-07-01'
7272
AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
7373
AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') IS NOT NULL
7474
AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') != ''

sql/2025/third-parties/consent_signal_survival_rate_through_redirects_working.sql

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ WITH pages AS (
4949
FROM
5050
`httparchive.crawl.pages`
5151
WHERE
52-
date = '2025-06-01'
52+
date = '2025-07-01'
5353
AND rank <= 100000 -- Expanded to top 100K sites
5454
),
5555

@@ -67,7 +67,7 @@ initial_consent_requests AS (
6767
ON
6868
r.client = p.client AND r.page = p.page
6969
WHERE
70-
r.date = '2025-06-01'
70+
r.date = '2025-07-01'
7171
AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
7272
AND REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
7373
),
@@ -90,7 +90,7 @@ requests_with_redirects AS (
9090
AND icr.page = r.page
9191
AND icr.url = r.url
9292
WHERE
93-
r.date = '2025-06-01'
93+
r.date = '2025-07-01'
9494
AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') IS NOT NULL
9595
AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') != ''
9696
),

sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ WITH pages AS (
99
FROM
1010
`httparchive.crawl.pages`
1111
WHERE
12-
date = '2025-06-01'
12+
date = '2025-07-01'
1313
),
1414

1515
requests AS (
@@ -20,7 +20,7 @@ requests AS (
2020
FROM
2121
`httparchive.crawl.requests`
2222
WHERE
23-
date = '2025-06-01'
23+
date = '2025-07-01'
2424
-- Pre-filter: only process URLs that contain consent-related parameters
2525
AND REGEXP_CONTAINS(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
2626
),
@@ -122,7 +122,7 @@ totals AS (
122122
NET.HOST(r.url) = NET.HOST(tp.domain),
123123
UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
124124
WHERE
125-
r.date = '2025-06-01'
125+
r.date = '2025-07-01'
126126
AND p.rank <= rank_grouping
127127
GROUP BY
128128
r.client,

sql/2025/third-parties/content_encoding.sql

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ third_party AS (
2121
requests r
2222
ON NET.HOST(r.url) = NET.HOST(tp.domain)
2323
WHERE
24-
date = '2025-06-01' AND
24+
date = '2025-07-01' AND
2525
category != 'hosting'
2626
GROUP BY
2727
domain

sql/2025/third-parties/content_encoding_by_content_type.sql

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ third_party AS (
2222
requests r
2323
ON NET.HOST(r.url) = NET.HOST(tp.domain)
2424
WHERE
25-
date = '2025-06-01' AND
25+
date = '2025-07-01' AND
2626
category != 'hosting'
2727
GROUP BY
2828
domain

0 commit comments

Comments
 (0)