Skip to content

Commit c14742b

Browse files
Merge branch 'main' into privacy-sql-2025
2 parents 2b523a9 + 28299ac commit c14742b

33 files changed

Lines changed: 1948 additions & 47 deletions

sql/2025/security/cookie_age_percentiles.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ LANGUAGE js AS '''
3737
WITH age_values AS (
3838
SELECT
3939
client,
40-
getCookieAgeValues(response_headers.value, INT64(summary.startedDateTime)) AS values
40+
getCookieAgeValues(response_headers.value, UNIX_SECONDS(TIMESTAMP(STRING(payload.startedDateTime)))) AS values
4141
FROM
4242
`httparchive.crawl.requests`,
4343
UNNEST(response_headers) AS response_headers

sql/2025/security/cookie_max_age_expires_top_values.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ WITH max_age_values AS (
3434
FROM
3535
`httparchive.crawl.requests`,
3636
UNNEST(response_headers) AS rh,
37-
UNNEST(JSON_QUERY_ARRAY(getCookieAgeValues(rh.value, INT64(summary.startedDateTime))), '$.maxAge') AS max_age_value
37+
UNNEST(JSON_QUERY_ARRAY(getCookieAgeValues(rh.value, INT64(summary.startedDateTime)), '$.maxAge')) AS max_age_value
3838
WHERE
3939
date = '2025-07-01' AND
4040
is_root_page AND
@@ -46,7 +46,7 @@ expires_values AS (
4646
client,
4747
expires_value
4848
FROM
49-
`httparchive.all.requests`,
49+
`httparchive.crawl.requests`,
5050
UNNEST(response_headers) AS rh,
5151
UNNEST(JSON_QUERY_ARRAY(getCookieAgeValues(rh.value, INT64(summary.startedDateTime)), '$.expires')) AS expires_value
5252
WHERE

sql/2025/security/csp_script_source_list_keywords_per_request.sql

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,19 @@ FROM (
2424
SELECT
2525
client,
2626
COUNT(0) AS total_pages_with_csp,
27-
COUNTIF(csp_header IS NOT NULL) AS freq_csp,
28-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src')) AS freq_default_script_src,
29-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+strict-dynamic')) AS freq_strict_dynamic,
30-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+nonce-')) AS freq_nonce,
31-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+unsafe-inline')) AS freq_script_unsafe_inline,
32-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+unsafe-eval')) AS freq_script_unsafe_eval,
33-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)unsafe-inline')) AS freq_unsafe_inline,
34-
COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)unsafe-eval')) AS freq_unsafe_eval
27+
COUNTIF(csp_combined IS NOT NULL) AS freq_csp,
28+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)(default|script)-src')) AS freq_default_script_src,
29+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)(default|script)-src[^;]+strict-dynamic')) AS freq_strict_dynamic,
30+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)(default|script)-src[^;]+nonce-')) AS freq_nonce,
31+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)(default|script)-src[^;]+unsafe-inline')) AS freq_script_unsafe_inline,
32+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)(default|script)-src[^;]+unsafe-eval')) AS freq_script_unsafe_eval,
33+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)unsafe-inline')) AS freq_unsafe_inline,
34+
COUNTIF(REGEXP_CONTAINS(csp_combined, '(?i)unsafe-eval')) AS freq_unsafe_eval
3535
FROM (
3636
SELECT
3737
client,
38-
response_headers.value AS csp_header
38+
url,
39+
STRING_AGG(response_headers.value, '; ') AS csp_combined
3940
FROM
4041
`httparchive.crawl.requests`,
4142
UNNEST(response_headers) AS response_headers
@@ -44,6 +45,8 @@ FROM (
4445
is_root_page AND
4546
is_main_document AND
4647
LOWER(response_headers.name) = 'content-security-policy'
48+
GROUP BY
49+
client, url
4750
)
4851
GROUP BY
4952
client

sql/2025/security/hsts_attributes.sql

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,25 @@
33
# Question: How many websites use HSTS includeSubDomains and preload?
44
SELECT
55
client,
6-
COUNT(0) AS total_requests,
7-
COUNTIF(hsts_header_val IS NOT NULL) AS total_hsts_headers,
8-
COUNTIF(hsts_header_val IS NOT NULL) / COUNT(0) AS pct_hsts_requests,
9-
COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)max-age\s*=\s*\d+') AND NOT REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_valid_max_age,
10-
COUNTIF(REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_zero_max_age,
11-
COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)includeSubDomains')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_include_subdomains,
12-
COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)preload')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_preload
6+
COUNT(0) AS total_requests_with_hsts_header,
7+
COUNTIF(hsts_header_val IS NOT NULL) AS total_non_null_hsts_headers,
8+
SAFE_DIVIDE(COUNTIF(hsts_header_val IS NOT NULL), COUNT(0)) AS pct_hsts_requests,
9+
SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)max-age\s*=\s*\d+') AND NOT REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')), COUNTIF(hsts_header_val IS NOT NULL)) AS pct_valid_max_age,
10+
SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')), COUNTIF(hsts_header_val IS NOT NULL)) AS pct_zero_max_age,
11+
SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)includeSubDomains')), COUNTIF(hsts_header_val IS NOT NULL)) AS pct_include_subdomains,
12+
SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)preload')), COUNTIF(hsts_header_val IS NOT NULL)) AS pct_preload
1313
FROM (
1414
SELECT
1515
client,
16-
REGEXP_EXTRACT(summary.respOtherHeaders, r'(?i)strict-transport-security =([^,]+)') AS hsts_header_val
16+
response_headers.value AS hsts_header_val
1717
FROM
18-
`httparchive.crawl.requests`
18+
`httparchive.crawl.requests`,
19+
UNNEST(response_headers) AS response_headers
1920
WHERE
2021
date = '2025-07-01' AND
2122
is_root_page AND
22-
is_main_document
23+
is_main_document AND
24+
LOWER(response_headers.name) = 'strict-transport-security'
2325
)
2426
GROUP BY
2527
client

sql/2025/security/https_server_redirects.sql

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ SELECT
88
COUNT(DISTINCT url) AS total_urls_on_page,
99
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) AS count_http_urls_on_page,
1010
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) / COUNT(DISTINCT url) AS pct_http_urls_on_page,
11-
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND STRING(summary.resp_location) LIKE 'https://%' AND INT64(summary.status) BETWEEN 300 AND 399 THEN url END)) AS count_http_urls_with_https_redirect_on_page,
12-
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND STRING(summary.resp_location) LIKE 'https://%' AND INT64(summary.status) BETWEEN 300 AND 399 THEN url END)) / COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) AS pct_http_urls_with_https_redirect_on_page
11+
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND (SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'location' LIMIT 1) LIKE 'https://%' AND INT64(summary.status) BETWEEN 300 AND 399 THEN url END)) AS count_http_urls_with_https_redirect_on_page, -- noqa: AM09
12+
COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND (SELECT value FROM UNNEST(response_headers) WHERE LOWER(name) = 'location' LIMIT 1) LIKE 'https://%' AND INT64(summary.status) BETWEEN 300 AND 399 THEN url END)) / COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) AS pct_http_urls_with_https_redirect_on_page -- noqa: AM09
1313
FROM
14-
`httparchive.crawl.requests`
14+
`httparchive.crawl.requests`,
15+
UNNEST(response_headers) AS response_headers
1516
WHERE
1617
date = '2025-07-01' AND
1718
is_root_page

sql/2025/security/robot_txt_sensitive_disallow.sql

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,10 @@
44
CREATE TEMPORARY FUNCTION getAllDisallowedEndpoints(data JSON)
55
RETURNS ARRAY<STRING> DETERMINISTIC
66
LANGUAGE js AS '''
7-
let parsed_data;
8-
try {
9-
parsed_data = JSON.parse(data);
10-
} catch (e) {
7+
if (data == null || data["/robots.txt"] == undefined || !data["/robots.txt"]["found"]) {
118
return [];
129
}
13-
if (parsed_data == null || parsed_data["/robots.txt"] == undefined || !parsed_data["/robots.txt"]["found"]) {
14-
return [];
15-
}
16-
const parsed_endpoints = parsed_data["/robots.txt"]["data"]["matched_disallows"];
10+
const parsed_endpoints = data["/robots.txt"]["data"]["matched_disallows"];
1711
const endpoints_list = Object.keys(parsed_endpoints).map(key => parsed_endpoints[key]).flat();
1812
return Array.from(new Set(endpoints_list));
1913
''';

sql/2025/security/sri_coverage_per_page.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#standardSQL
22
# Section: Content Inclusion - Subresource Integrity
33
# Question: How many scripts on a page have the integrity attribute? (percentage)
4-
CREATE TEMP FUNCTION getNumScriptElements(sris ARRAY<STRING>) AS (
4+
CREATE TEMP FUNCTION getNumScriptElements(sris ARRAY<JSON>) AS (
55
(SELECT COUNT(0) FROM UNNEST(sris) AS sri WHERE JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script')
66
);
77

sql/2025/security/tls_ca_issuers_pages_over_time.sql

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Question: What is the distribution of CA issuers for all pages over time?
44
# Note: currently includes HTTP (i.e., pages with no issuer)
55
SELECT
6+
date,
67
client,
78
issuer,
89
SUM(COUNT(0)) OVER (PARTITION BY client, date) AS total_https_pages,
@@ -23,10 +24,12 @@ FROM (
2324
GROUP BY
2425
client,
2526
request_host,
26-
issuer
27+
issuer,
28+
date
2729
)
2830
GROUP BY
2931
client,
30-
issuer
32+
issuer,
33+
date
3134
ORDER BY
32-
pct DESC
35+
date DESC
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#standardSQL
2+
# The distribution of cache header adoption on websites by client.
3+
4+
SELECT
5+
client,
6+
COUNT(0) AS total_requests,
7+
8+
COUNTIF(uses_cache_control) AS total_using_cache_control,
9+
COUNTIF(uses_max_age) AS total_using_max_age,
10+
COUNTIF(uses_expires) AS total_using_expires,
11+
COUNTIF(uses_max_age AND uses_expires) AS total_using_max_age_and_expires,
12+
COUNTIF(
13+
uses_cache_control AND uses_expires
14+
) AS total_using_both_cc_and_expires,
15+
COUNTIF(
16+
NOT uses_cache_control AND NOT uses_expires
17+
) AS total_using_neither_cc_and_expires,
18+
COUNTIF(
19+
uses_cache_control AND NOT uses_expires
20+
) AS total_using_only_cache_control,
21+
COUNTIF(
22+
NOT uses_cache_control AND uses_expires
23+
) AS total_using_only_expires,
24+
25+
COUNTIF(uses_cache_control) / COUNT(0) AS pct_cache_control,
26+
COUNTIF(uses_max_age) / COUNT(0) AS pct_using_max_age,
27+
COUNTIF(uses_expires) / COUNT(0) AS pct_using_expires,
28+
COUNTIF(
29+
uses_max_age AND uses_expires
30+
) / COUNT(0) AS pct_using_max_age_and_expires,
31+
COUNTIF(
32+
uses_cache_control AND uses_expires
33+
) / COUNT(0) AS pct_using_both_cc_and_expires,
34+
COUNTIF(
35+
NOT uses_cache_control AND NOT uses_expires
36+
) / COUNT(0) AS pct_using_neither_cc_nor_expires,
37+
COUNTIF(
38+
uses_cache_control AND NOT uses_expires
39+
) / COUNT(0) AS pct_using_only_cache_control,
40+
COUNTIF(
41+
NOT uses_cache_control AND uses_expires
42+
) / COUNT(0) AS pct_using_only_expires
43+
44+
FROM (
45+
SELECT
46+
client,
47+
url,
48+
LOGICAL_OR(
49+
header.name = 'expires' AND header.value IS NOT NULL AND TRIM(
50+
header.value
51+
) != ''
52+
) AS uses_expires,
53+
LOGICAL_OR(
54+
header.name = 'cache-control' AND
55+
header.value IS NOT NULL AND
56+
TRIM(header.value) != ''
57+
) AS uses_cache_control,
58+
LOGICAL_OR(
59+
header.name = 'cache-control' AND REGEXP_CONTAINS(
60+
header.value, r'(?i)max-age\s*=\s*[0-9]+'
61+
)
62+
) AS uses_max_age,
63+
64+
LOGICAL_OR(
65+
header.name = 'etag' AND (
66+
header.value IS NULL OR TRIM(header.value) = ''
67+
)
68+
) AS uses_no_etag,
69+
LOGICAL_OR(
70+
header.name = 'etag' AND header.value IS NOT NULL AND TRIM(
71+
header.value
72+
) != ''
73+
) AS uses_etag,
74+
LOGICAL_OR(
75+
header.name = 'last-modified' AND
76+
header.value IS NOT NULL AND
77+
TRIM(header.value) != ''
78+
) AS uses_last_modified,
79+
80+
LOGICAL_OR(
81+
header.name = 'etag' AND REGEXP_CONTAINS(
82+
TRIM(header.value), '^W/".*"'
83+
)
84+
) AS uses_weak_etag,
85+
LOGICAL_OR(
86+
header.name = 'etag' AND REGEXP_CONTAINS(
87+
TRIM(header.value), '^".*"'
88+
)
89+
) AS uses_strong_etag
90+
91+
FROM
92+
`httparchive.crawl.requests`,
93+
UNNEST(response_headers) AS header
94+
WHERE
95+
date = '2025-07-01'
96+
GROUP BY
97+
client,
98+
url
99+
)
100+
101+
GROUP BY
102+
client
103+
ORDER BY
104+
client;
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#standardSQL
2+
# The distribution of CDN adoption on websites by client.
3+
4+
SELECT
5+
client,
6+
total,
7+
IF(cdn = '', 'No CDN', cdn) AS cdn,
8+
COUNT(0) AS freq,
9+
ROUND(100 * COUNT(0) / total, 2) AS pct
10+
FROM (
11+
SELECT
12+
client,
13+
COUNT(0) AS total,
14+
ARRAY_CONCAT_AGG(
15+
SPLIT(JSON_VALUE(summary.cdn), ', ')
16+
) AS cdn_list
17+
FROM
18+
`httparchive.crawl.pages`
19+
WHERE
20+
date = '2025-07-01' AND
21+
is_root_page = TRUE
22+
GROUP BY
23+
client
24+
),
25+
UNNEST(cdn_list) AS cdn
26+
GROUP BY
27+
client,
28+
cdn,
29+
total
30+
ORDER BY
31+
pct DESC,
32+
client ASC,
33+
cdn ASC;

0 commit comments

Comments
 (0)