From 7bfb9e88a55c560c96483485401181c45fb1b9de Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 22:49:55 +0200 Subject: [PATCH 01/13] Include sql/** in lint workflow and lint only changed SQL files on PRs --- .github/workflows/lintsql.yml | 11 +++++++++-- sql/util/green_web_foundation.sql | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index a3b3c26064d..1bd6fdb14ec 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -7,7 +7,8 @@ on: workflow_dispatch: pull_request: paths: - src/requirements.txt + - src/requirements.txt + - 'sql/**' jobs: lint: name: Lint SQL @@ -28,4 +29,10 @@ jobs: startsWith(github.event.pull_request.title,'Bump sqlfluff') == true run: | pip install -r src/requirements.txt - sqlfluff lint sql -p 4 + if [ "${{ github.event_name }}" == "workflow_dispatch" ] || [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then + # Lint all SQL files for workflow_dispatch or sqlfluff bumps + sqlfluff lint sql -p 4 + else + # Lint only changed SQL files + git diff --name-only --diff-filter=ACMRT origin/${{ github.base_ref }}...HEAD | grep '\.sql$' | xargs -r sqlfluff lint + fi diff --git a/sql/util/green_web_foundation.sql b/sql/util/green_web_foundation.sql index ef21a82c9fd..d177332b5a6 100644 --- a/sql/util/green_web_foundation.sql +++ b/sql/util/green_web_foundation.sql @@ -2,7 +2,7 @@ SELECT -- noqa: AM04 DATE('2022-06-01') AS date, NET.HOST(LOWER(url)) AS host, NET.REG_DOMAIN(LOWER(url)) AS domain, - * + *, FROM # This is the raw database dump from GWF. 
`httparchive.almanac.green_web_foundation_raw` From 1ba5ef278855aa143d36f7ce854f5c4c95a4c346 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 22:52:34 +0200 Subject: [PATCH 02/13] Remove step-level if from Lint SQL --- .github/workflows/lintsql.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index 1bd6fdb14ec..4dd55d0f917 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -24,9 +24,6 @@ jobs: with: python-version: '3.12' - name: Lint SQL code - if: | - github.event_name == 'workflow_dispatch' || - startsWith(github.event.pull_request.title,'Bump sqlfluff') == true run: | pip install -r src/requirements.txt if [ "${{ github.event_name }}" == "workflow_dispatch" ] || [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then From 390d814e63cee3b5c23150a3395d71ee4b156b6b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 22:54:04 +0200 Subject: [PATCH 03/13] Add -q to pip install --- .github/workflows/lintsql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index 4dd55d0f917..5c8cd152c99 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -25,7 +25,7 @@ jobs: python-version: '3.12' - name: Lint SQL code run: | - pip install -r src/requirements.txt + pip install -r src/requirements.txt -q if [ "${{ github.event_name }}" == "workflow_dispatch" ] || [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then # Lint all SQL files for workflow_dispatch or sqlfluff bumps sqlfluff lint sql -p 4 From 0dd3af782c984a3befee09b1339c017dc9dfb7f7 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 22:59:15 +0200 Subject: [PATCH 04/13] Break long shell 
lines in Lint SQL workflow for readability --- .github/workflows/lintsql.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index 5c8cd152c99..5459eae3d05 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -26,10 +26,13 @@ jobs: - name: Lint SQL code run: | pip install -r src/requirements.txt -q - if [ "${{ github.event_name }}" == "workflow_dispatch" ] || [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then + if [ "${{ github.event_name }}" == "workflow_dispatch" ] || \ + [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then # Lint all SQL files for workflow_dispatch or sqlfluff bumps sqlfluff lint sql -p 4 else # Lint only changed SQL files - git diff --name-only --diff-filter=ACMRT origin/${{ github.base_ref }}...HEAD | grep '\.sql$' | xargs -r sqlfluff lint + git diff --name-only --diff-filter=ACMRT origin/${{ github.base_ref }}...HEAD \ + | grep '\.sql$' \ + | xargs -r sqlfluff lint fi From b97a9532d06d16f34a24b8665a8228395fd92a37 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:04:33 +0200 Subject: [PATCH 05/13] Disable SQLFluff in global super-linter and move Dependabot auto-merge for sqlfluff into Lint SQL workflow --- .github/workflows/linter.yml | 3 +-- .github/workflows/lintsql.yml | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index d76805d6272..501f6e79511 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -44,7 +44,7 @@ jobs: VALIDATE_MARKDOWN: true VALIDATE_PYTHON_PYLINT: true VALIDATE_PYTHON_FLAKE8: true - VALIDATE_SQLFLUFF: true + VALIDATE_SQLFLUFF: false VALIDATE_YAML: true dependabot: @@ -73,7 +73,6 @@ jobs: steps.metadata.outputs.update-type == 'version-update:semver-minor' ) && ( 
contains(steps.metadata.outputs.dependency-names, 'prettier') || - contains(steps.metadata.outputs.dependency-names, 'sqlfluff') || contains(steps.metadata.outputs.dependency-names, 'super-linter') ) run: gh pr merge --admin --squash "$PR_URL" diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index 5459eae3d05..1c80414f700 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -36,3 +36,34 @@ jobs: | grep '\.sql$' \ | xargs -r sqlfluff lint fi + + + dependabot: + name: Dependabot auto-merge + runs-on: ubuntu-latest + needs: lint + if: | + github.event.pull_request.user.login == 'dependabot[bot]' && + github.repository == 'HTTPArchive/almanac.httparchive.org' + + permissions: + contents: write + pull-requests: write + + steps: + - name: Dependabot metadata + id: metadata + uses: dependabot/fetch-metadata@v2 + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + + - name: Enable auto-merge for Dependabot PRs + if: | + ( + steps.metadata.outputs.update-type == 'version-update:semver-patch' || + steps.metadata.outputs.update-type == 'version-update:semver-minor' + ) && contains(steps.metadata.outputs.dependency-names, 'sqlfluff') + run: gh pr merge --admin --squash "$PR_URL" + env: + PR_URL: ${{github.event.pull_request.html_url}} + GH_TOKEN: ${{secrets.GITHUB_TOKEN}} From 1a737646d5fd5c939b9658d3fb4e13eed2d4b8a6 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 23:07:12 +0200 Subject: [PATCH 06/13] remove explicit VALIDATE_SQLFLUFF:false --- .github/workflows/linter.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 501f6e79511..a218f418df7 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -44,7 +44,6 @@ jobs: VALIDATE_MARKDOWN: true VALIDATE_PYTHON_PYLINT: true VALIDATE_PYTHON_FLAKE8: true - VALIDATE_SQLFLUFF: false VALIDATE_YAML: true dependabot: From 
7f4e2256a84cf3da3f89f873669547a3521129c9 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 21 Oct 2025 22:50:33 +0200 Subject: [PATCH 07/13] linting --- sql/2019/fonts/06_32.sql | 1 - sql/2021/css/image_dimension_popularity.sql | 1 - sql/2021/css/keyframes_positions.sql | 1 - sql/2022/jamstack/jamstack_random_1000_urls.sql | 2 +- sql/2024/jamstack/hugo_astro_next.sql | 2 +- sql/2024/jamstack/jamstack-overview.sql | 2 +- sql/2024/jamstack/jamstack_distribution_by_rank.sql | 2 +- sql/2024/jamstack/js_frameworks.sql | 2 +- sql/2024/jamstack/paas.sql | 2 +- sql/2024/jamstack/ssg.sql | 2 +- sql/2024/markup/content_encoding.sql | 2 +- sql/2024/sustainability/video_autoplay_values.sql | 1 - sql/util/green_web_foundation.sql | 2 +- 13 files changed, 9 insertions(+), 13 deletions(-) diff --git a/sql/2019/fonts/06_32.sql b/sql/2019/fonts/06_32.sql index 9d7f2207968..cf66279d4ff 100644 --- a/sql/2019/fonts/06_32.sql +++ b/sql/2019/fonts/06_32.sql @@ -20,4 +20,3 @@ FROM ( ORDER BY freq / total DESC ) -LIMIT 100 diff --git a/sql/2021/css/image_dimension_popularity.sql b/sql/2021/css/image_dimension_popularity.sql index cd5e1022cf7..bf895194450 100644 --- a/sql/2021/css/image_dimension_popularity.sql +++ b/sql/2021/css/image_dimension_popularity.sql @@ -56,4 +56,3 @@ FROM ( ORDER BY pct DESC ) -LIMIT 500 diff --git a/sql/2021/css/keyframes_positions.sql b/sql/2021/css/keyframes_positions.sql index 9aed35f2a02..e2893d033a4 100644 --- a/sql/2021/css/keyframes_positions.sql +++ b/sql/2021/css/keyframes_positions.sql @@ -56,4 +56,3 @@ FROM ( ORDER BY pct DESC ) -LIMIT 500 diff --git a/sql/2022/jamstack/jamstack_random_1000_urls.sql b/sql/2022/jamstack/jamstack_random_1000_urls.sql index 6c325bb6f73..60567a3ed75 100644 --- a/sql/2022/jamstack/jamstack_random_1000_urls.sql +++ b/sql/2022/jamstack/jamstack_random_1000_urls.sql @@ -5,4 +5,4 @@ FROM WHERE methodology = '2022' AND date = '2022-06-01' -LIMIT 1000 +LIMIT 1000 -- noqa: 
AM09 diff --git a/sql/2024/jamstack/hugo_astro_next.sql b/sql/2024/jamstack/hugo_astro_next.sql index 5279d59c405..385defca4a7 100644 --- a/sql/2024/jamstack/hugo_astro_next.sql +++ b/sql/2024/jamstack/hugo_astro_next.sql @@ -9,7 +9,7 @@ CREATE TEMPORARY FUNCTION GET_MAX_AGE(response_headers ARRAY Date: Tue, 21 Oct 2025 23:00:46 +0200 Subject: [PATCH 08/13] Restrict Lint SQL workflow PR path to .sql files --- .github/workflows/lintsql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index 1c80414f700..af229ddc7d7 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -8,7 +8,7 @@ on: pull_request: paths: - src/requirements.txt - - 'sql/**' + - 'sql/**.sql' jobs: lint: name: Lint SQL From a3be8494a8f111aea5e4dfc33e2d59f90fb89d2b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 26 Oct 2025 20:21:58 +0100 Subject: [PATCH 09/13] remove outer query --- sql/2019/fonts/06_32.sql | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/sql/2019/fonts/06_32.sql b/sql/2019/fonts/06_32.sql index cf66279d4ff..ecaf1551e5d 100644 --- a/sql/2019/fonts/06_32.sql +++ b/sql/2019/fonts/06_32.sql @@ -1,22 +1,19 @@ #standardSQL # 06_32: Top font hosts SELECT - * -FROM ( - SELECT - client, - NET.HOST(url) AS host, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - ROUND(COUNT(0) * 100 / SUM(COUNT(0)) OVER (PARTITION BY client), 2) AS pct - FROM - `httparchive.almanac.requests` - WHERE - date = '2019-07-01' AND - type = 'font' - GROUP BY - client, - host - ORDER BY - freq / total DESC -) + client, + NET.HOST(url) AS host, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + ROUND(COUNT(0) * 100 / SUM(COUNT(0)) OVER (PARTITION BY client), 2) AS pct +FROM + `httparchive.almanac.requests` +WHERE + date = '2019-07-01' AND + type = 'font' 
+GROUP BY + client, + host +ORDER BY + freq / total DESC +LIMIT 100 From 00f041b74c6228e30a24c19899944d1ec3b7760b Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:08:12 +0100 Subject: [PATCH 10/13] unwrapped subqueries --- sql/2021/css/image_dimension_popularity.sql | 94 ++++++++++----------- sql/2021/css/keyframes_positions.sql | 52 ++++++------ 2 files changed, 69 insertions(+), 77 deletions(-) diff --git a/sql/2021/css/image_dimension_popularity.sql b/sql/2021/css/image_dimension_popularity.sql index bf895194450..facdc3f7f1e 100644 --- a/sql/2021/css/image_dimension_popularity.sql +++ b/sql/2021/css/image_dimension_popularity.sql @@ -1,58 +1,54 @@ #standardSQL # CSS-initiated image px dimension popularity SELECT - * + client, + height, + width, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct FROM ( SELECT client, - height, - width, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct - FROM ( - SELECT - client, - page, - url AS img_url, - JSON_VALUE(payload, 
'$._initiator') AS css_url + FROM + `httparchive.almanac.requests` WHERE - height IS NOT NULL AND - width IS NOT NULL - GROUP BY + date = '2021-07-01' AND + type = 'image' +) +JOIN ( + SELECT client, - height, - width - ORDER BY - pct DESC + page, + url AS css_url + FROM + `httparchive.almanac.requests` + WHERE + date = '2021-07-01' AND + type = 'css' +) +USING (client, page, css_url) +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + JSON_EXTRACT_SCALAR(image, '$.url') AS img_url, + SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalHeight') AS INT64) AS height, + SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalWidth') AS INT64) AS width + FROM + `httparchive.pages.2021_07_01_*`, + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._Images'), '$')) AS image ) +USING (client, page, img_url) +WHERE + height IS NOT NULL AND + width IS NOT NULL +GROUP BY + client, + height, + width +ORDER BY + pct DESC diff --git a/sql/2021/css/keyframes_positions.sql b/sql/2021/css/keyframes_positions.sql index e2893d033a4..9dd0cee28b5 100644 --- a/sql/2021/css/keyframes_positions.sql +++ b/sql/2021/css/keyframes_positions.sql @@ -24,35 +24,31 @@ try { '''; SELECT - * -FROM ( + client, + position, + COUNT(DISTINCT page) AS pages, + ANY_VALUE(total) AS total_pages, + COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + `httparchive.almanac.parsed_css`, + UNNEST(getKeyframePositions(css)) AS position +JOIN ( SELECT - client, - position, - COUNT(DISTINCT page) AS pages, - ANY_VALUE(total) AS total_pages, - COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct + _TABLE_SUFFIX AS client, + COUNT(0) AS total FROM - `httparchive.almanac.parsed_css`, - UNNEST(getKeyframePositions(css)) AS position - JOIN ( - 
SELECT - _TABLE_SUFFIX AS client, - COUNT(0) AS total - FROM - `httparchive.summary_pages.2021_07_01_*` - GROUP BY - client - ) - USING (client) - WHERE - date = '2021-07-01' + `httparchive.summary_pages.2021_07_01_*` GROUP BY - client, - position - ORDER BY - pct DESC + client ) +USING (client) +WHERE + date = '2021-07-01' +GROUP BY + client, + position +ORDER BY + pct DESC From 4e14b5a3214c92b154cd343a4c0029dfa103e7ae Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 6 Nov 2025 22:25:20 +0100 Subject: [PATCH 11/13] Update sql/2021/css/keyframes_positions.sql Co-authored-by: Barry Pollard --- sql/2021/css/keyframes_positions.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/css/keyframes_positions.sql b/sql/2021/css/keyframes_positions.sql index 9dd0cee28b5..8d6aaad3ec3 100644 --- a/sql/2021/css/keyframes_positions.sql +++ b/sql/2021/css/keyframes_positions.sql @@ -52,3 +52,4 @@ GROUP BY position ORDER BY pct DESC +LIMIT 500 From 7650eeae6e9fd565e9ec46f09bf9742b4d4d565a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 6 Nov 2025 22:25:43 +0100 Subject: [PATCH 12/13] Update sql/2021/css/image_dimension_popularity.sql Co-authored-by: Barry Pollard --- sql/2021/css/image_dimension_popularity.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/css/image_dimension_popularity.sql b/sql/2021/css/image_dimension_popularity.sql index facdc3f7f1e..bbd8be55e34 100644 --- a/sql/2021/css/image_dimension_popularity.sql +++ b/sql/2021/css/image_dimension_popularity.sql @@ -52,3 +52,4 @@ GROUP BY width ORDER BY pct DESC +LIMIT 500 From ac260fc15dc9d5772d75f750561abc482765b550 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Fri, 7 Nov 2025 11:15:52 +0000 Subject: [PATCH 13/13] Apply suggestion from @tunetheweb --- sql/2021/css/image_dimension_popularity.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/2021/css/image_dimension_popularity.sql b/sql/2021/css/image_dimension_popularity.sql index bbd8be55e34..c5b5bb3e21a 100644 --- a/sql/2021/css/image_dimension_popularity.sql +++ b/sql/2021/css/image_dimension_popularity.sql @@ -52,4 +52,4 @@ GROUP BY width ORDER BY pct DESC -LIMIT 500 +LIMIT 500