From 1220a63c163f5e1ace6cc5ba82010e41d826ec05 Mon Sep 17 00:00:00 2001 From: lkitching Date: Thu, 6 Sep 2018 16:14:22 +0100 Subject: [PATCH 1/4] Add queries for validating data against CubiQL's expectations. Issue #127 - Add validation queries for use with rdf-validator which checks a data source against the requirements CubiQL makes on data cubes. The queries are templates which expect the relevant configuration values to be provided when executed. Add instructions to the README on running these validations. --- README.md | 29 +++++++++++++++++++ .../codelist_members_must_have_labels.sparql | 11 +++++++ validation/datasets_must_have_labels.sparql | 11 +++++++ ...ts_must_have_measure_type_component.sparql | 12 ++++++++ validation/dimensions_must_have_labels.sparql | 11 +++++++ validation/geo_values_must_have_labels.sparql | 16 ++++++++++ validation/measures_must_have_labels.sparql | 11 +++++++ ...ime_values_must_have_beginning_time.sparql | 16 ++++++++++ .../time_values_must_have_end_time.sparql | 16 ++++++++++ 9 files changed, 133 insertions(+) create mode 100644 validation/codelist_members_must_have_labels.sparql create mode 100644 validation/datasets_must_have_labels.sparql create mode 100644 validation/datasets_must_have_measure_type_component.sparql create mode 100644 validation/dimensions_must_have_labels.sparql create mode 100644 validation/geo_values_must_have_labels.sparql create mode 100644 validation/measures_must_have_labels.sparql create mode 100644 validation/time_values_must_have_beginning_time.sparql create mode 100644 validation/time_values_must_have_end_time.sparql diff --git a/README.md b/README.md index cd7bb85..549cd29 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,35 @@ During development `lein run` can be used instead of building the uberjar: The server hosts a GraphQL endpoint at http://localhost:PORT/graphql which follows the protocol described [here](http://graphql.org/learn/serving-over-http/). 
+## Data validation + +CubiQL currently makes assumptions about the data in the cubes it finds, namely that certain resources (e.g. dimensions and measures) have +an associated label, and that time and geographical dimension values have a particular structure. When producing your own data for use +with CubiQL you may want to check that it conforms to the expectations CubiQL has. + +The [rdf-validator](https://github.com/Swirrl/rdf-validator) is a tool for running a collection of validation tests against a SPARQL endpoint. +CubiQL includes validation queries encoding its requirements in the `validation` directory. To run these validations against your data: + +1. Download the [latest version](https://github.com/Swirrl/rdf-validator/releases) of RDF validator +2. Clone the CubiQL repository or copy the files in the validation directory to your local machine +3. Define the CubiQL configuration file for your data. The required configuration keys are listed below +4. Run the RDF validator by specifying the location of the data, validation directory and CubiQL configuration e.g. + + java -jar rdf-validator-standalone.jar --endpoint my_data.ttl --suite validation/ --variables cubiql-config.edn + + The `--endpoint` parameter can refer to an RDF file, a folder containing RDF files or a remote SPARQL endpoint URI. + +The `cubiql-config.edn` file must contain the following keys: + + | Key | + |---------------------| + | :geo-dimension-uri | + | :time-dimension-uri | + | :codelist-label-uri | + | :dataset-label-uri | + +If you are not using time or geography dimensions in your runtime configuration, you should set `geo-dimension-uri` and/or `time-dimension-uri` to a dummy value. + ## License Copyright © 2017 Swirrl IT Ltd. 
diff --git a/validation/codelist_members_must_have_labels.sparql b/validation/codelist_members_must_have_labels.sparql new file mode 100644 index 0000000..113c29d --- /dev/null +++ b/validation/codelist_members_must_have_labels.sparql @@ -0,0 +1,11 @@ +PREFIX qb: +PREFIX rdfs: +PREFIX skos: + +SELECT ?this WHERE { + { ?codelist skos:member ?this } + UNION { ?this skos:inScheme ?codelist } + FILTER NOT EXISTS { + ?this <{{codelist-label-uri}}> ?label . + } +} diff --git a/validation/datasets_must_have_labels.sparql b/validation/datasets_must_have_labels.sparql new file mode 100644 index 0000000..f202460 --- /dev/null +++ b/validation/datasets_must_have_labels.sparql @@ -0,0 +1,11 @@ +# Datasets must have an associated label + +PREFIX rdfs: +PREFIX qb: + +SELECT ?this WHERE { + ?this a qb:DataSet . + FILTER NOT EXISTS { + ?this <{{dataset-label-uri}}> ?label . + } +} diff --git a/validation/datasets_must_have_measure_type_component.sparql b/validation/datasets_must_have_measure_type_component.sparql new file mode 100644 index 0000000..7940145 --- /dev/null +++ b/validation/datasets_must_have_measure_type_component.sparql @@ -0,0 +1,12 @@ +# Datasets must have a qb:measureType component + +PREFIX qb: + +SELECT ?this WHERE { + ?this a qb:DataSet . + FILTER NOT EXISTS { + ?this qb:structure ?dsd . + ?dsd qb:component ?comp . + ?comp qb:dimension qb:measureType . + } +} diff --git a/validation/dimensions_must_have_labels.sparql b/validation/dimensions_must_have_labels.sparql new file mode 100644 index 0000000..647b82d --- /dev/null +++ b/validation/dimensions_must_have_labels.sparql @@ -0,0 +1,11 @@ +# All dimension properties must have an associated label + +PREFIX rdfs: +PREFIX qb: + +SELECT ?this WHERE { + ?this a qb:DimensionProperty . + FILTER NOT EXISTS { + ?this <{{dataset-label-uri}}> ?label . 
+ } +} diff --git a/validation/geo_values_must_have_labels.sparql b/validation/geo_values_must_have_labels.sparql new file mode 100644 index 0000000..f252602 --- /dev/null +++ b/validation/geo_values_must_have_labels.sparql @@ -0,0 +1,16 @@ +PREFIX qb: + +SELECT ?this WHERE { + { SELECT DISTINCT ?this WHERE { + ?obs a qb:Observation . + ?obs <{{geo-dimension-uri}}> ?this . + } + } + + FILTER NOT EXISTS { + ?this <{{dataset-label-uri}}> ?label . + } +} + +# Finds all geographic dimension values which do not have a corresponding label + diff --git a/validation/measures_must_have_labels.sparql b/validation/measures_must_have_labels.sparql new file mode 100644 index 0000000..8a27b46 --- /dev/null +++ b/validation/measures_must_have_labels.sparql @@ -0,0 +1,11 @@ +# All measure properties should have a label + +PREFIX rdfs: +PREFIX qb: + +SELECT ?this WHERE { + ?this a qb:MeasureProperty . + FILTER NOT EXISTS { + ?this <{{dataset-label-uri}}> ?label . + } +} diff --git a/validation/time_values_must_have_beginning_time.sparql b/validation/time_values_must_have_beginning_time.sparql new file mode 100644 index 0000000..6cf5a49 --- /dev/null +++ b/validation/time_values_must_have_beginning_time.sparql @@ -0,0 +1,16 @@ +PREFIX qb: +PREFIX time: +PREFIX xsd: + +SELECT ?this WHERE { + { SELECT DISTINCT ?this WHERE { + ?obs a qb:Observation . + ?obs <{{time-dimension-uri}}> ?this . + } + } + FILTER NOT EXISTS { + ?this time:hasBeginning ?begin . + ?begin time:inXSDDateTime ?begintime . + FILTER(datatype(?begintime) = xsd:dateTime) + } +} \ No newline at end of file diff --git a/validation/time_values_must_have_end_time.sparql b/validation/time_values_must_have_end_time.sparql new file mode 100644 index 0000000..8328398 --- /dev/null +++ b/validation/time_values_must_have_end_time.sparql @@ -0,0 +1,16 @@ +PREFIX qb: +PREFIX time: +PREFIX xsd: + +SELECT ?this WHERE { + { SELECT DISTINCT ?this WHERE { + ?obs a qb:Observation . + ?obs <{{time-dimension-uri}}> ?this . 
+ } + } + FILTER NOT EXISTS { + ?this time:hasEnd ?end . + ?end time:inXSDDateTime ?endtime . + FILTER(datatype(?endtime) = xsd:dateTime) + } +} \ No newline at end of file From 0959f1254003f973e2c9d76aff7f51d45e607d5e Mon Sep 17 00:00:00 2001 From: zeginis Date: Wed, 26 Sep 2018 18:15:32 +0300 Subject: [PATCH 2/4] Add new SPARQL queries 1. There is a code list that contains ONLY the concepts used at the cube 2.There is a code list for each qb:DimensionProperty --- ..._codelist_must_have_only_codes_used.sparql | 23 +++++++++++++++++++ .../dimensions_must_have_codelist.sparql | 12 ++++++++++ 2 files changed, 35 insertions(+) create mode 100644 validation/dimensions_codelist_must_have_only_codes_used.sparql create mode 100644 validation/dimensions_must_have_codelist.sparql diff --git a/validation/dimensions_codelist_must_have_only_codes_used.sparql b/validation/dimensions_codelist_must_have_only_codes_used.sparql new file mode 100644 index 0000000..7e156e2 --- /dev/null +++ b/validation/dimensions_codelist_must_have_only_codes_used.sparql @@ -0,0 +1,23 @@ +# The codelist of each dimension should contain only the codes used at the cube +# Check 1) if all codes used at the cube exist at the codelist and +# 2)all codes of the codelist appear at the cube +PREFIX qb: +PREFIX skos: + +Select distinct ?v where{ + {SELECT ?v WHERE { + ?q a qb:DataSet. + ?q qb:structure/qb:component ?comp. + ?comp qb:dimension|qb:attribute ?dim . + ?comp <{{codelist-predicate}}> ?list . + ?obs ?dim ?v. + FILTER NOT EXISTS { { ?list skos:member ?v } UNION { ?v skos:inScheme ?list } } + }} UNION + {SELECT distinct ?v WHERE { + ?q a qb:DataSet. + ?q qb:structure/qb:component ?comp. + ?comp qb:dimension|qb:attribute ?dim . + ?comp <{{codelist-predicate}}> ?list . + {?list skos:member ?v } UNION { ?v skos:inScheme ?list } + FILTER NOT EXISTS {?obs qb:dataSet ?q. 
?obs ?dim ?v}}} +} \ No newline at end of file diff --git a/validation/dimensions_must_have_codelist.sparql b/validation/dimensions_must_have_codelist.sparql new file mode 100644 index 0000000..d0e7330 --- /dev/null +++ b/validation/dimensions_must_have_codelist.sparql @@ -0,0 +1,12 @@ +# All dimension properties must have an associated codelist +# The codelist can be associated to either the qb:ComponentSpecification or the qb:DimensionProperty + +PREFIX rdfs: +PREFIX qb: + +SELECT ?t WHERE { + ?this a <{{codelist-source}}> . + FILTER NOT EXISTS { + ?this <{{codelist-predicate}}> ?codelist . + } +} From e796dc311afeff90d79805c6ecb0b98d4aa578a4 Mon Sep 17 00:00:00 2001 From: lkitching Date: Mon, 1 Oct 2018 14:41:28 +0100 Subject: [PATCH 3/4] Fix validation queries for execution within rdf-validator. Move the comments for the dimension validation queries to the end of of the file. Comments before the query cause sesame to infer the wrong query type (i.e. a graph query instead of a tuple query) which results in the wrong accept headers being sent to the remote SPARQL endpoint. --- .../dimensions_codelist_must_have_only_codes_used.sparql | 9 +++++---- validation/dimensions_must_have_codelist.sparql | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/validation/dimensions_codelist_must_have_only_codes_used.sparql b/validation/dimensions_codelist_must_have_only_codes_used.sparql index 7e156e2..ad2c0e0 100644 --- a/validation/dimensions_codelist_must_have_only_codes_used.sparql +++ b/validation/dimensions_codelist_must_have_only_codes_used.sparql @@ -1,6 +1,3 @@ -# The codelist of each dimension should contain only the codes used at the cube -# Check 1) if all codes used at the cube exist at the codelist and -# 2)all codes of the codelist appear at the cube PREFIX qb: PREFIX skos: @@ -20,4 +17,8 @@ Select distinct ?v where{ ?comp <{{codelist-predicate}}> ?list . 
{?list skos:member ?v } UNION { ?v skos:inScheme ?list } FILTER NOT EXISTS {?obs qb:dataSet ?q. ?obs ?dim ?v}}} -} \ No newline at end of file +} + +# The codelist of each dimension should contain only the codes used at the cube +# Check 1) if all codes used at the cube exist at the codelist and +# 2)all codes of the codelist appear at the cube diff --git a/validation/dimensions_must_have_codelist.sparql b/validation/dimensions_must_have_codelist.sparql index d0e7330..ddf1053 100644 --- a/validation/dimensions_must_have_codelist.sparql +++ b/validation/dimensions_must_have_codelist.sparql @@ -1,12 +1,12 @@ -# All dimension properties must have an associated codelist -# The codelist can be associated to either the qb:ComponentSpecification or the qb:DimensionProperty - PREFIX rdfs: PREFIX qb: -SELECT ?t WHERE { +SELECT ?this WHERE { ?this a <{{codelist-source}}> . FILTER NOT EXISTS { ?this <{{codelist-predicate}}> ?codelist . } } + +# All dimension properties must have an associated codelist +# The codelist can be associated to either the qb:ComponentSpecification or the qb:DimensionProperty From 7a190da3e24300abf6ff1fa98306b33fc9f00ca3 Mon Sep 17 00:00:00 2001 From: zeginis Date: Tue, 9 Oct 2018 18:04:38 +0300 Subject: [PATCH 4/4] Each dimension is not required to have a codelist --- validation/dimensions_must_have_codelist.sparql | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 validation/dimensions_must_have_codelist.sparql diff --git a/validation/dimensions_must_have_codelist.sparql b/validation/dimensions_must_have_codelist.sparql deleted file mode 100644 index ddf1053..0000000 --- a/validation/dimensions_must_have_codelist.sparql +++ /dev/null @@ -1,12 +0,0 @@ -PREFIX rdfs: -PREFIX qb: - -SELECT ?this WHERE { - ?this a <{{codelist-source}}> . - FILTER NOT EXISTS { - ?this <{{codelist-predicate}}> ?codelist . 
- } -} - -# All dimension properties must have an associated codelist -# The codelist can be associated to either the qb:ComponentSpecification or the qb:DimensionProperty