@@ -981,7 +981,83 @@ ranked AS (
981981 FROM all_matches
982982),
983983
984- -- Canadian province code lookup (code → province name)
-- UK: classify each host by the devolved-nation government domain it ends with.
-- Patterns are anchored on whole dot-delimited labels at the end of the host,
-- so both a subdomain (`www.gov.scot`) and the bare apex (`gov.scot`) match,
-- while a look-alike label such as `notgov.scot` does not.
-- Every row of pages_scored is returned; hosts that match nothing get a NULL
-- uk_nation. Northern Ireland is only recognised via nidirect.gov.uk.
uk_nation_from_domain AS (
  SELECT
    p.page,
    p.host,
    CASE
      -- Scotland
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)gov\.scot$')        THEN 'Scotland'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)nhs\.scot$')        THEN 'Scotland'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)parliament\.scot$') THEN 'Scotland'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)police\.scot$')     THEN 'Scotland'
      -- Wales (English and Welsh-language domains)
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)gov\.wales$')             THEN 'Wales'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)llyw\.cymru$')            THEN 'Wales'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)nhs\.wales$')             THEN 'Wales'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)senedd\.(wales|cymru)$')  THEN 'Wales'
      -- Northern Ireland
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)nidirect\.gov\.uk$') THEN 'Northern Ireland'
      ELSE NULL
    END AS uk_nation
  FROM pages_scored p
),
1002+
-- Australia: state/territory code lookup (code -> full name).
-- This list is the single source of truth for which codes are recognised;
-- au_state_from_gov_au joins against it instead of repeating the codes.
au_state_map AS (
  SELECT code, state
  FROM UNNEST([
    STRUCT('nsw' AS code, 'New South Wales' AS state),
    ('vic', 'Victoria'),
    ('qld', 'Queensland'),
    ('sa',  'South Australia'),
    ('wa',  'Western Australia'),
    ('tas', 'Tasmania'),
    ('act', 'Australian Capital Territory'),
    ('nt',  'Northern Territory')
  ])
),
-- Australia: take the dot-delimited label directly in front of `.gov.au`
-- (e.g. `health.nsw.gov.au` -> 'nsw') and map it to a state/territory.
-- INNER JOIN: only hosts whose label is a known code produce a row, so
-- consumers should LEFT JOIN this CTE.
au_state_from_gov_au AS (
  SELECT
    p.page,
    p.host,
    m.state AS au_state
  FROM pages_scored p
  INNER JOIN au_state_map m
    -- The label must be a complete host label: preceded by start-of-host or a dot.
    ON LOWER(REGEXP_EXTRACT(p.host, r'(?i)(?:^|\.)([a-z]+)\.gov\.au$')) = m.code
),
1020+
-- Brazil: state (UF) code lookup (code -> full name), including Distrito Federal.
-- This list is the single source of truth for which codes are recognised;
-- br_state_from_gov_br joins against it instead of repeating the codes.
br_state_map AS (
  SELECT code, state
  FROM UNNEST([
    STRUCT('ac' AS code, 'Acre' AS state),
    ('al', 'Alagoas'),
    ('ap', 'Amapá'),
    ('am', 'Amazonas'),
    ('ba', 'Bahia'),
    ('ce', 'Ceará'),
    ('df', 'Distrito Federal'),
    ('es', 'Espírito Santo'),
    ('go', 'Goiás'),
    ('ma', 'Maranhão'),
    ('mt', 'Mato Grosso'),
    ('ms', 'Mato Grosso do Sul'),
    ('mg', 'Minas Gerais'),
    ('pa', 'Pará'),
    ('pb', 'Paraíba'),
    ('pr', 'Paraná'),
    ('pe', 'Pernambuco'),
    ('pi', 'Piauí'),
    ('rj', 'Rio de Janeiro'),
    ('rn', 'Rio Grande do Norte'),
    ('rs', 'Rio Grande do Sul'),
    ('ro', 'Rondônia'),
    ('rr', 'Roraima'),
    ('sc', 'Santa Catarina'),
    ('sp', 'São Paulo'),
    ('se', 'Sergipe'),
    ('to', 'Tocantins')
  ])
),
-- Brazil: take the dot-delimited label directly in front of `.gov.br`
-- (e.g. `www.saude.sp.gov.br` -> 'sp') and map it to a state.
-- INNER JOIN: only hosts whose label is a known code produce a row, so
-- consumers should LEFT JOIN this CTE.
br_state_from_gov_br AS (
  SELECT
    p.page,
    p.host,
    m.state AS br_state
  FROM pages_scored p
  INNER JOIN br_state_map m
    -- Raw string literal, matching the other regexes in this query.
    -- The label must be a complete host label: preceded by start-of-host or a dot.
    ON LOWER(REGEXP_EXTRACT(p.host, r'(?i)(?:^|\.)([a-z]+)\.gov\.br$')) = m.code
),
1046+
-- Spain: recognise regional governments by a fixed list of known domains on
-- regional TLDs (.cat, .eus, .gal, .madrid). These are not `*.es` hosts and no
-- code lookup is involved. A host matches if it equals the domain or is a
-- subdomain of it.
-- Every row of pages_scored is returned; hosts that match nothing get a NULL
-- es_region.
es_region_from_known AS (
  SELECT
    p.page,
    p.host,
    CASE
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)gencat\.cat$')       THEN 'Catalonia'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)euskadi\.eus$')      THEN 'Basque Country'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)xunta\.gal$')        THEN 'Galicia'
      WHEN REGEXP_CONTAINS(p.host, r'(?i)(^|\.)comunidad\.madrid$') THEN 'Community of Madrid'
      ELSE NULL
    END AS es_region
  FROM pages_scored p
),
1059+
1060+ -- Canada: province code lookup (code → province name)
9851061ca_prov_map AS (
9861062 SELECT * FROM UNNEST([
9871063 STRUCT(' ab' AS code, ' Alberta' AS province),
@@ -992,8 +1068,6 @@ ca_prov_map AS (
9921068 (' yt' ,' Yukon' ), (' yk' ,' Yukon' ) -- accept either token
9931069 ])
9941070),
995-
996- -- Look for dot-or-hyphen delimited province tokens inside *.gc.ca hosts
9971071ca_prov_from_gc AS (
9981072 SELECT
9991073 p .page , p .host ,
@@ -1117,7 +1191,14 @@ domain_scores AS (
11171191-- Final SELECT
11181192SELECT DISTINCT
11191193 ds .bucket AS country,
1120- COALESCE(usc .us_state , cpc .ca_province ) AS subnational, -- single column
1194+ COALESCE(
1195+ uk .uk_nation ,
1196+ cpc .ca_province ,
1197+ usc .us_state ,
1198+ au .au_state ,
1199+ br .br_state ,
1200+ es .es_region
1201+ ) AS subnational,
11211202 ds .gov_domain ,
11221203 ds .page ,
11231204 ds .is_root_page ,
@@ -1126,27 +1207,10 @@ SELECT DISTINCT
11261207 ds .best_practices_score ,
11271208 ds .seo_score
11281209FROM domain_scores ds
1129- LEFT JOIN us_state_classified usc
1130- ON usc .page = ds .page AND usc .host = ds .host
1131- LEFT JOIN ca_province_classified cpc
1132- ON cpc .page = ds .page AND cpc .host = ds .host
1210+ LEFT JOIN uk_nation_from_domain uk ON uk .page = ds .page AND uk .host = ds .host
1211+ LEFT JOIN ca_province_classified cpc ON cpc .page = ds .page AND cpc .host = ds .host
1212+ LEFT JOIN us_state_classified usc ON usc .page = ds .page AND usc .host = ds .host
1213+ LEFT JOIN au_state_from_gov_au au ON au .page = ds .page AND au .host = ds .host
1214+ LEFT JOIN br_state_from_gov_br br ON br .page = ds .page AND br .host = ds .host
1215+ LEFT JOIN es_region_from_known es ON es .page = ds .page AND es .host = ds .host
11331216ORDER BY country, subnational, gov_domain, page;
1134-
1135- /*
1136- SELECT
1137- ds.bucket AS country,
1138- usc.us_state,
1139- ds.gov_domain,
1140- ds.page,
1141- ds.is_root_page,
1142- ds.performance_score,
1143- ds.accessibility_score,
1144- ds.best_practices_score,
1145- ds.seo_score
1146- FROM domain_scores ds
1147- LEFT JOIN us_state_classified usc
1148- ON usc.page = ds.page AND usc.host = ds.host
1149- -- optional
1150- -- WHERE ds.bucket = 'United States (USA)'
1151- ORDER BY country, us_state, gov_domain, page;
1152- */
0 commit comments