-
-
Notifications
You must be signed in to change notification settings - Fork 209
Expand file tree
/
Copy pathtracker_distribution.sql
More file actions
54 lines (50 loc) · 1.5 KB
/
tracker_distribution.sql
File metadata and controls
54 lines (50 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
-- noqa: disable=PRS
-- Number of websites that deploy a certain number of trackers
WITH base_totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total_websites
FROM `httparchive.crawl.pages`
WHERE date = '2025-07-01'
--AND rank = 1000
GROUP BY client
),
whotracksme AS (
SELECT
NET.HOST(domain) AS domain,
tracker
FROM `httparchive.almanac.whotracksme`
WHERE date = '2025-07-01'
AND category IN ('advertising', 'pornvertising', 'site_analytics', 'social_media')
),
tracker_counts AS (
SELECT
client,
root_page,
COUNT(DISTINCT tracker) AS number_of_trackers
FROM `httparchive.crawl.requests`
LEFT JOIN whotracksme
ON
NET.HOST(url) = domain OR
ENDS_WITH(NET.HOST(url), CONCAT('.', domain))
WHERE
date = '2025-07-01'
--AND rank = 1000
AND url NOT IN ('https://android.clients.google.com/checkin', 'https://android.clients.google.com/c2dm/register3')
GROUP BY
client,
root_page
)
FROM tracker_counts
|> AGGREGATE COUNT(DISTINCT root_page) AS number_of_websites GROUP BY client, number_of_trackers
|> EXTEND SUM(number_of_websites) OVER (PARTITION BY client ORDER BY number_of_trackers DESC) AS ccdf_websites
|> JOIN base_totals USING (client)
|> EXTEND ccdf_websites / total_websites AS ccdf
|> DROP total_websites, number_of_websites
|> PIVOT(
ANY_VALUE(ccdf_websites) AS websites_count,
ANY_VALUE(ccdf) AS ccdf
FOR client IN ('desktop', 'mobile')
)
|> RENAME ccdf_mobile AS Mobile, ccdf_desktop AS Desktop
|> ORDER BY number_of_trackers