Skip to content

Commit e215418

Browse files
vsdaanvikvanderlindentunetheweb
authored
Security sql 2025 (#4181)
* copied 2024 queries * Added partitioned attribute to query * Updated queries to use 2025 data * Rewrote documentdomain usage to use crawl.pages * Rewrote feature adaption queries to use response headers field * Changed all except features and iframes to crawl database * Updated feature adoption queries to match new db structure * Updated iframe queries to match new db structure * CA issuer over time query added * Added expired cert query * Updated CSP queries in line with issue 4036 * Added Audit issues query * Removed unused query, added browser log queries * Added document-policy-report-only * Added doc policy to more secheader queries * Linting * Updated json selectors * Better JSON selectors, removed some subqueries, removed unneeded comments * Linting * Subquery removal * Fix linting error --------- Co-authored-by: Vik Vanderlinden <vikvanderlinden@users.noreply.github.com> Co-authored-by: Barry Pollard <barrypollard@google.com>
1 parent 1dd6917 commit e215418

File tree

73 files changed

+3527
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+3527
-0
lines changed

sql/2025/security/audit_issues.sql

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#standardSQL
# Question: What Audit issues are present in the pages? https://chromedevtools.github.io/devtools-protocol/tot/Audits/#type-InspectorIssueCode
# Output: one row per (client, issuename) with the number and share of root pages
#         that reported at least one audit issue whose `code` equals that name (case-insensitive).
# Note: The audit-issue array is attached with LEFT JOIN so that pages whose
#       `_audit_issues` array is empty or missing still contribute to total_pages.
#       With a comma (inner) join those pages would be dropped and the percentage
#       would be relative to "pages with at least one audit issue" instead of all pages.
SELECT
  client,
  issuename,
  COUNT(DISTINCT page) AS total_pages,
  COUNT(DISTINCT IF(LOWER(JSON_VALUE(issue.code)) = LOWER(issuename), page, NULL)) AS count_with_issue,
  COUNT(DISTINCT IF(LOWER(JSON_VALUE(issue.code)) = LOWER(issuename), page, NULL)) / COUNT(DISTINCT page) AS pct_with_issue
FROM
  `httparchive.crawl.pages`
LEFT JOIN
  # Yields a single NULL `issue` row for pages without any audit issue.
  UNNEST(JSON_QUERY_ARRAY(payload._audit_issues)) AS issue
CROSS JOIN
  # Issue codes reported on; every page row is repeated once per code.
  UNNEST([
    'CookieIssue',
    'MixedContentIssue',
    'ContentSecurityPolicyIssue',
    'CorsIssue'
  ]) AS issuename
WHERE
  date = '2025-07-01' AND
  is_root_page
GROUP BY
  client,
  issuename
ORDER BY
  client,
  pct_with_issue DESC
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#standardSQL
# Section: Attack Preventions - Bot protection services
# Question: Which bot protection services are used most often on mobile and desktop sites?
# Notes: Technologies are taken from the Wappalyzer 'Security' category, which mostly
#        holds bot protection services such as reCAPTCHA and Cloudflare Bot Management.
# Issue: After Wappalyzer updates, the 'Security' category also contains 'HSTS' (a security header)
#        and 'Really Simple SSL & Security' in significant numbers. Do we want to filter them out?

# Number of root pages per client; used as the denominator for pct.
WITH totals AS (
  SELECT
    client,
    COUNT(0) AS total
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
  GROUP BY
    client
)

SELECT
  client,
  tech.technology,
  COUNT(0) AS freq,
  total,
  COUNT(0) / total AS pct
FROM
  `httparchive.crawl.pages`
CROSS JOIN
  UNNEST(technologies) AS tech
CROSS JOIN
  UNNEST(tech.categories) AS category
INNER JOIN
  totals
USING (client)
WHERE
  date = '2025-07-01' AND
  is_root_page AND
  category = 'Security'
GROUP BY
  client,
  total,
  tech.technology
ORDER BY
  pct DESC

sql/2025/security/browser_logs.sql

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#standardSQL
# Question: On how many root pages does the browser log contain security-related messages
#           (mixed content, CORS, CSP, Feature/Permissions/Document-Policy, ...)?
# Note: All figures are per page. A page is counted once for a message type no matter how many
#       matching log entries it has, and pages without any browser log entry are kept
#       (LEFT JOIN) so that total_pages is the number of root pages, not the number of log entries.
# Note: The `cross_originer_opener_*` column names are misspelled but kept as-is so that
#       existing consumers of the result do not break.

# One row per page with a flag per message type. LOGICAL_OR returns NULL when a page
# has no log text at all; COUNTIF below treats NULL like FALSE.
WITH page_flags AS (
  SELECT
    client,
    page,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'Mixed Content')) AS has_mixed_content,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'has been blocked by CORS policy')) AS has_cors_blocked,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'Content Security Policy|Content-Security-Policy')) AS has_content_security_policy,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'Error with Feature-Policy')) AS has_error_with_feature_policy,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'Error with Permissions-Policy')) AS has_error_with_permissions_policy,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'because its MIME type')) AS has_x_content_type,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'X-Frame-Options')) AS has_x_frame_options,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'The request has been aborted')) AS has_navigator_credentials,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'The Cross-Origin-Opener-Policy has been ignored')) AS has_cross_origin_opener,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'Document-Policy HTTP header|Document Policy violation')) AS has_document_policy,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'The resource has been blocked')) AS has_subresource_integrity,
    LOGICAL_OR(REGEXP_CONTAINS(log_text, 'This document requires')) AS has_trusted_types
  FROM (
    SELECT
      client,
      page,
      JSON_VALUE(log_entry.text) AS log_text
    FROM
      `httparchive.crawl.pages`
    LEFT JOIN
      UNNEST(JSON_QUERY_ARRAY(payload, '$._browser_logs')) AS log_entry
    WHERE
      date = '2025-07-01' AND
      is_root_page
  )
  GROUP BY
    client,
    page
)

SELECT
  client,
  COUNT(0) AS total_pages,
  COUNTIF(has_mixed_content) AS mixed_content_count,
  COUNTIF(has_mixed_content) / COUNT(0) AS mixed_content_freq,
  COUNTIF(has_cors_blocked) AS cors_blocked_count,
  COUNTIF(has_cors_blocked) / COUNT(0) AS cors_blocked_freq,
  COUNTIF(has_content_security_policy) AS content_security_policy_count,
  COUNTIF(has_content_security_policy) / COUNT(0) AS content_security_policy_freq,
  COUNTIF(has_error_with_feature_policy) AS error_with_feature_policy_count,
  COUNTIF(has_error_with_feature_policy) / COUNT(0) AS error_with_feature_policy_freq,
  COUNTIF(has_error_with_permissions_policy) AS error_with_permissions_policy_count,
  COUNTIF(has_error_with_permissions_policy) / COUNT(0) AS error_with_permissions_policy_freq,
  COUNTIF(has_x_content_type) AS x_content_type_count,
  COUNTIF(has_x_content_type) / COUNT(0) AS x_content_type_freq,
  COUNTIF(has_x_frame_options) AS x_frame_options_count,
  COUNTIF(has_x_frame_options) / COUNT(0) AS x_frame_options_freq,
  COUNTIF(has_navigator_credentials) AS navigator_credentials_count,
  COUNTIF(has_navigator_credentials) / COUNT(0) AS navigator_credentials_freq,
  COUNTIF(has_cross_origin_opener) AS cross_originer_opener_count,
  COUNTIF(has_cross_origin_opener) / COUNT(0) AS cross_originer_opener_freq,
  COUNTIF(has_document_policy) AS document_policy_count,
  COUNTIF(has_document_policy) / COUNT(0) AS document_policy_freq,
  COUNTIF(has_subresource_integrity) AS subresource_integrity_count,
  COUNTIF(has_subresource_integrity) / COUNT(0) AS subresource_integrity_freq,
  COUNTIF(has_trusted_types) AS trusted_types_count,
  COUNTIF(has_trusted_types) / COUNT(0) AS trusted_types_freq
FROM
  page_flags
GROUP BY
  client
ORDER BY
  client
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#standardSQL
# Section: Attack preventions - Preventing attacks using Clear-Site-Data
# Question: Which Clear-Site-Data header values are most prevalent?
# Notes: Many of the values in use are still invalid (missing quotes).
#        Each (host, header value) pair is counted only once.

# One row per response carrying a Clear-Site-Data header.
WITH csd_values AS (
  SELECT
    client,
    NET.HOST(url) AS host,
    header.value AS csd_header
  FROM
    `httparchive.crawl.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2025-07-01' AND
    is_root_page AND
    # Optionally add `is_main_document AND` here to only look at main document responses
    # (the majority of CSD headers are set on them).
    LOWER(header.name) = 'clear-site-data'
)

SELECT
  client,
  csd_header,
  SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_csd_headers,
  COUNT(DISTINCT host) AS freq,
  COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct
FROM
  csd_values
GROUP BY
  client,
  csd_header
ORDER BY
  pct DESC
LIMIT
  100
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#standardSQL
# Section: Attack Preventions - Preventing attacks using Cross-Origin policies
# Question: Which are the most common COEP values?
# Note: Only headers on main document responses are considered.
#       Each (host, header value) pair is counted once.

# One row per main document response carrying a Cross-Origin-Embedder-Policy header.
WITH coep_values AS (
  SELECT
    client,
    NET.HOST(url) AS host,
    header.value AS coep_header
  FROM
    `httparchive.crawl.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2025-07-01' AND
    is_root_page AND
    is_main_document AND
    LOWER(header.name) = 'cross-origin-embedder-policy'
)

SELECT
  client,
  coep_header,
  SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_coep_headers,
  COUNT(DISTINCT host) AS freq,
  COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct
FROM
  coep_values
GROUP BY
  client,
  coep_header
ORDER BY
  pct DESC
LIMIT
  100
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#standardSQL
# Section: Cookies - Cookie Age
# Question: How many cookies (total, hosts, pages) have negative Max-Age, Expires and real age (Max-Age has precedence over Expires) attributes?
# Note: Query is expensive and slow (14TB). It is also inefficient: for every cookie a result array of
#       length 0 or 1 is built per attribute and then unnested again; avoiding arrays would skip the unnesting.
# Note: Some of the percentages are quite different from the old query; one of the two might be broken
#       (difficult to compare as both cannot operate on a shared dataset).

# Extracts the age attributes of a single Set-Cookie header value.
#   cookie_value:   raw Set-Cookie header value.
#   epochOfRequest: time of the request; assumed to be a Unix epoch in seconds because the Expires
#                   date is converted to seconds before subtracting it - TODO confirm against the
#                   `summary.startedDateTime` field.
# Returns a JSON string {"maxAge": [...], "expires": [...], "realAge": [...]} where each array holds
# at most one value in seconds:
#   maxAge  - the Max-Age attribute, if present.
#   expires - Expires minus epochOfRequest, or null when Expires is present but not a parseable date.
#   realAge - Max-Age if present, otherwise the usable Expires delta.
CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
RETURNS STRING DETERMINISTIC
LANGUAGE js AS '''
  // Backslashes are doubled because this code lives in a non-raw SQL string literal.
  // A single optional minus sign is accepted so the captured text always parses as a number.
  const regexMaxAge = /max-age\\s*=\\s*(?<value>-?[0-9]+)/i;
  const regexExpires = /expires\\s*=\\s*(?<value>.*?)(;|$)/i;

  const result = {
    "maxAge": [],
    "expires": [],
    "realAge": []
  };

  let maxAge = null;
  let expires = null;

  const maxAgeMatch = regexMaxAge.exec(cookie_value);
  if (maxAgeMatch) {
    maxAge = Number(maxAgeMatch[1]);
    result["maxAge"].push(maxAge);
  }

  const expiresMatch = regexExpires.exec(cookie_value);
  if (expiresMatch) {
    const delta = Math.round(Number(new Date(expiresMatch[1])) / 1000) - epochOfRequest;
    // Unparseable dates give NaN; record them as null so the attribute is still counted.
    expires = Number.isSafeInteger(delta) ? delta : null;
    result["expires"].push(expires);
  }

  // Compare against null explicitly: an age of 0 is a valid value (it expires the cookie
  // immediately) and must not be treated as "attribute missing".
  if (maxAge !== null) {
    result["realAge"].push(maxAge);
  } else if (expires !== null) {
    result["realAge"].push(expires);
  }

  return JSON.stringify(result);
''';
36+
37+
# One row per Set-Cookie response header, with the extracted age attributes as a JSON string.
WITH cookie_ages AS (
  SELECT
    client,
    page,
    NET.HOST(url) AS host,
    getCookieAgeValues(header.value, INT64(summary.startedDateTime)) AS ages_json
  FROM
    `httparchive.crawl.requests`
  CROSS JOIN
    UNNEST(response_headers) AS header
  WHERE
    date = '2025-07-01' AND
    is_root_page AND
    LOWER(header.name) = 'set-cookie'
),

# Cookies that carry a Max-Age attribute; is_negative is NULL when the value is not numeric.
max_age_values AS (
  SELECT
    client,
    COUNTIF(is_negative) AS count_negative_max_age,
    COUNT(0) AS total_max_age_cookies,
    COUNT(DISTINCT IF(is_negative, page, NULL)) AS num_max_age_pages,
    COUNT(DISTINCT page) AS total_max_age_pages,
    COUNT(DISTINCT IF(is_negative, host, NULL)) AS num_max_age_hosts,
    COUNT(DISTINCT host) AS total_max_age_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      SAFE_CAST(age AS NUMERIC) <= 0 AS is_negative
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(ages_json, '$.maxAge')) AS age
  )
  GROUP BY
    client
),

# Cookies that carry an Expires attribute.
expires_values AS (
  SELECT
    client,
    COUNTIF(is_negative) AS count_negative_expires,
    COUNT(0) AS total_expires_cookies,
    COUNT(DISTINCT IF(is_negative, page, NULL)) AS num_expires_pages,
    COUNT(DISTINCT page) AS total_expires_pages,
    COUNT(DISTINCT IF(is_negative, host, NULL)) AS num_expires_hosts,
    COUNT(DISTINCT host) AS total_expires_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      SAFE_CAST(age AS NUMERIC) <= 0 AS is_negative
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(ages_json, '$.expires')) AS age
  )
  GROUP BY
    client
),

# Cookies with an effective age (Max-Age, falling back to Expires).
real_age_values AS (
  SELECT
    client,
    COUNTIF(is_negative) AS count_negative_real_age,
    COUNT(0) AS total_real_age_cookies,
    COUNT(DISTINCT IF(is_negative, page, NULL)) AS num_real_age_pages,
    COUNT(DISTINCT page) AS total_real_age_pages,
    COUNT(DISTINCT IF(is_negative, host, NULL)) AS num_real_age_hosts,
    COUNT(DISTINCT host) AS total_real_age_hosts
  FROM (
    SELECT
      client,
      page,
      host,
      SAFE_CAST(age AS NUMERIC) <= 0 AS is_negative
    FROM
      cookie_ages
    CROSS JOIN
      UNNEST(JSON_QUERY_ARRAY(ages_json, '$.realAge')) AS age
  )
  GROUP BY
    client
)

# Combine the three per-client summaries into one row per client.
SELECT
  client,
  count_negative_max_age,
  count_negative_max_age / total_max_age_cookies AS pct_negative_max_age,
  num_max_age_pages,
  num_max_age_pages / total_max_age_pages AS pct_max_age_pages,
  num_max_age_hosts,
  num_max_age_hosts / total_max_age_hosts AS pct_max_age_hosts,
  count_negative_expires,
  count_negative_expires / total_expires_cookies AS pct_negative_expires,
  num_expires_pages,
  num_expires_pages / total_expires_pages AS pct_expires_pages,
  num_expires_hosts,
  num_expires_hosts / total_expires_hosts AS pct_expires_hosts,
  count_negative_real_age,
  count_negative_real_age / total_real_age_cookies AS pct_negative_real_age,
  num_real_age_pages,
  num_real_age_pages / total_real_age_pages AS pct_real_age_pages,
  num_real_age_hosts,
  num_real_age_hosts / total_real_age_hosts AS pct_real_age_hosts
FROM
  max_age_values
INNER JOIN
  expires_values
USING (client)
INNER JOIN
  real_age_values
USING (client)
ORDER BY
  client

0 commit comments

Comments
 (0)