diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index d76805d6272..a218f418df7 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -44,7 +44,6 @@ jobs: VALIDATE_MARKDOWN: true VALIDATE_PYTHON_PYLINT: true VALIDATE_PYTHON_FLAKE8: true - VALIDATE_SQLFLUFF: true VALIDATE_YAML: true dependabot: @@ -73,7 +72,6 @@ jobs: steps.metadata.outputs.update-type == 'version-update:semver-minor' ) && ( contains(steps.metadata.outputs.dependency-names, 'prettier') || - contains(steps.metadata.outputs.dependency-names, 'sqlfluff') || contains(steps.metadata.outputs.dependency-names, 'super-linter') ) run: gh pr merge --admin --squash "$PR_URL" diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index a3b3c26064d..af229ddc7d7 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -7,7 +7,8 @@ on: workflow_dispatch: pull_request: paths: - src/requirements.txt + - src/requirements.txt + - 'sql/**.sql' jobs: lint: name: Lint SQL @@ -23,9 +24,46 @@ jobs: with: python-version: '3.12' - name: Lint SQL code - if: | - github.event_name == 'workflow_dispatch' || - startsWith(github.event.pull_request.title,'Bump sqlfluff') == true run: | - pip install -r src/requirements.txt - sqlfluff lint sql -p 4 + pip install -r src/requirements.txt -q + if [ "${{ github.event_name }}" == "workflow_dispatch" ] || \ + [[ "${{ github.event.pull_request.title }}" == Bump\ sqlfluff* ]]; then + # Lint all SQL files for workflow_dispatch or sqlfluff bumps + sqlfluff lint sql -p 4 + else + # Lint only changed SQL files + git diff --name-only --diff-filter=ACMRT origin/${{ github.base_ref }}...HEAD \ + | grep '\.sql$' \ + | xargs -r sqlfluff lint + fi + + + dependabot: + name: Dependabot auto-merge + runs-on: ubuntu-latest + needs: lint + if: | + github.event.pull_request.user.login == 'dependabot[bot]' && + github.repository == 'HTTPArchive/almanac.httparchive.org' + + permissions: + contents: write + pull-requests: write + + steps: + - name: Dependabot metadata + id: metadata + uses: dependabot/fetch-metadata@v2 + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + + - name: Enable auto-merge for Dependabot PRs + if: | + ( + steps.metadata.outputs.update-type == 'version-update:semver-patch' || + steps.metadata.outputs.update-type == 'version-update:semver-minor' + ) && contains(steps.metadata.outputs.dependency-names, 'sqlfluff') + run: gh pr merge --admin --squash "$PR_URL" + env: + PR_URL: ${{github.event.pull_request.html_url}} + GH_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/sql/2019/fonts/06_32.sql b/sql/2019/fonts/06_32.sql index 9d7f2207968..ecaf1551e5d 100644 --- a/sql/2019/fonts/06_32.sql +++ b/sql/2019/fonts/06_32.sql @@ -1,23 +1,19 @@ #standardSQL # 06_32: Top font hosts SELECT - * -FROM ( - SELECT - client, - NET.HOST(url) AS host, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - ROUND(COUNT(0) * 100 / SUM(COUNT(0)) OVER (PARTITION BY client), 2) AS pct - FROM - `httparchive.almanac.requests` - WHERE - date = '2019-07-01' AND - type = 'font' - GROUP BY - client, - host - ORDER BY - freq / total DESC -) + client, + NET.HOST(url) AS host, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + ROUND(COUNT(0) * 100 / SUM(COUNT(0)) OVER (PARTITION BY client), 2) AS pct +FROM + `httparchive.almanac.requests` +WHERE + date = '2019-07-01' AND + type = 'font' +GROUP BY + client, + host +ORDER BY + freq / total DESC LIMIT 100 diff --git a/sql/2021/css/image_dimension_popularity.sql b/sql/2021/css/image_dimension_popularity.sql index cd5e1022cf7..c5b5bb3e21a 100644 --- a/sql/2021/css/image_dimension_popularity.sql +++ b/sql/2021/css/image_dimension_popularity.sql @@ -1,59 +1,55 @@ #standardSQL # CSS-initiated image px dimension popularity SELECT - * + client, + height, + width, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct FROM ( SELECT client, - height, - width, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct - FROM ( - SELECT - client, - page, - url AS img_url, - JSON_VALUE(payload, '$._initiator') AS css_url - FROM - `httparchive.almanac.requests` - WHERE - date = '2021-07-01' AND - type = 'image' - ) - JOIN ( - SELECT - client, - page, - url AS css_url - FROM - `httparchive.almanac.requests` - WHERE - date = '2021-07-01' AND - type = 'css' - ) - USING (client, page, css_url) - JOIN ( - SELECT - _TABLE_SUFFIX AS client, - url AS page, - JSON_EXTRACT_SCALAR(image, '$.url') AS img_url, - SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalHeight') AS INT64) AS height, - SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalWidth') AS INT64) AS width - FROM - `httparchive.pages.2021_07_01_*`, - UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._Images'), '$')) AS image - ) - USING (client, page, img_url) + page, + url AS img_url, + JSON_VALUE(payload, '$._initiator') AS css_url + FROM + `httparchive.almanac.requests` WHERE - height IS NOT NULL AND - width IS NOT NULL - GROUP BY + date = '2021-07-01' AND + type = 'image' +) +JOIN ( + SELECT client, - height, - width - ORDER BY - pct DESC + page, + url AS css_url + FROM + `httparchive.almanac.requests` + WHERE + date = '2021-07-01' AND + type = 'css' +) +USING (client, page, css_url) +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + JSON_EXTRACT_SCALAR(image, '$.url') AS img_url, + SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalHeight') AS INT64) AS height, + SAFE_CAST(JSON_EXTRACT_SCALAR(image, '$.naturalWidth') AS INT64) AS width + FROM + `httparchive.pages.2021_07_01_*`, + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._Images'), '$')) AS image ) +USING (client, page, img_url) +WHERE + height IS NOT NULL AND + width IS NOT NULL +GROUP BY + client, + height, + width +ORDER BY + pct DESC LIMIT 500 diff --git a/sql/2021/css/keyframes_positions.sql b/sql/2021/css/keyframes_positions.sql index 9aed35f2a02..8d6aaad3ec3 100644 --- a/sql/2021/css/keyframes_positions.sql +++ b/sql/2021/css/keyframes_positions.sql @@ -24,36 +24,32 @@ try { '''; SELECT - * -FROM ( + client, + position, + COUNT(DISTINCT page) AS pages, + ANY_VALUE(total) AS total_pages, + COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + `httparchive.almanac.parsed_css`, + UNNEST(getKeyframePositions(css)) AS position +JOIN ( SELECT - client, - position, - COUNT(DISTINCT page) AS pages, - ANY_VALUE(total) AS total_pages, - COUNT(DISTINCT page) / ANY_VALUE(total) AS pct_pages, - COUNT(0) AS freq, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct + _TABLE_SUFFIX AS client, + COUNT(0) AS total FROM - `httparchive.almanac.parsed_css`, - UNNEST(getKeyframePositions(css)) AS position - JOIN ( - SELECT - _TABLE_SUFFIX AS client, - COUNT(0) AS total - FROM - `httparchive.summary_pages.2021_07_01_*` - GROUP BY - client - ) - USING (client) - WHERE - date = '2021-07-01' + `httparchive.summary_pages.2021_07_01_*` GROUP BY - client, - position - ORDER BY - pct DESC + client ) +USING (client) +WHERE + date = '2021-07-01' +GROUP BY + client, + position +ORDER BY + pct DESC LIMIT 500 diff --git a/sql/2022/jamstack/jamstack_random_1000_urls.sql b/sql/2022/jamstack/jamstack_random_1000_urls.sql index 6c325bb6f73..60567a3ed75 100644 --- a/sql/2022/jamstack/jamstack_random_1000_urls.sql +++ b/sql/2022/jamstack/jamstack_random_1000_urls.sql @@ -5,4 +5,4 @@ FROM WHERE methodology = '2022' AND date = '2022-06-01' -LIMIT 1000 +LIMIT 1000 -- noqa: AM09 diff --git a/sql/2024/jamstack/hugo_astro_next.sql b/sql/2024/jamstack/hugo_astro_next.sql index 5279d59c405..385defca4a7 100644 --- a/sql/2024/jamstack/hugo_astro_next.sql +++ b/sql/2024/jamstack/hugo_astro_next.sql @@ -9,7 +9,7 @@ CREATE TEMPORARY FUNCTION GET_MAX_AGE(response_headers ARRAY