diff --git a/mkdocs.yml b/mkdocs.yml index c81a99bc26..6d5f05b552 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -125,6 +125,9 @@ markdown_extensions: - name: tsvgz class: tsv format: !!python/name:bidsschematools.render.tsv.fence + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format - admonition - pymdownx.details plugins: @@ -137,6 +140,8 @@ plugins: - macros: module_name: tools/mkdocs_macros_bids/main on_error_fail: true + - panzoom: + full_screen: true - redirects: redirect_maps: "01-introduction.md": "introduction.md" diff --git a/pyproject.toml b/pyproject.toml index 10433b5dc3..1e14214bd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "mkdocs-macros-plugin", "mkdocs-redirects", "numpy", + "mkdocs-panzoom-plugin>=0.5.2", ] dynamic = ["version"] diff --git a/src/bids_metaschema.yaml b/src/bids_metaschema.yaml new file mode 100644 index 0000000000..4dcc6e9c69 --- /dev/null +++ b/src/bids_metaschema.yaml @@ -0,0 +1,1143 @@ +id: https://bids-specification.readthedocs.io/en/latest/schema +name: bids_metaschema +description: >- + LinkML metaschema for the Brain Imaging Data Structure (BIDS) schema. + This schema defines the structure and constraints for the BIDS schema + YAML/JSON files. +license: https://creativecommons.org/licenses/by/4.0/ + +prefixes: + linkml: https://w3id.org/linkml/ + bids: https://bids-specification.readthedocs.io/en/latest/schema/ + +default_prefix: bids +default_range: string + +imports: + - linkml:types + +enums: + + RequirementLevel: + description: >- + Requirement levels following RFC 2119. + permissible_values: + required: + description: The element MUST be present. + recommended: + description: The element SHOULD be present. + optional: + description: The element MAY be present. + deprecated: + description: The element SHOULD NOT be present. 
+ + FormatType: + description: >- + Format identifiers whose patterns are defined in the BIDS schema + under objects.formats. + permissible_values: + index: + description: Non-negative integer, possibly zero-padded. + label: + description: Free-form label with alphanumeric characters. + boolean: + description: Boolean value. + integer: + description: Integer value. + number: + description: Numeric value. + string: + description: String value. + hed_version: + description: HED version string. + bids_uri: + description: BIDS URI. + dataset_relative: + description: Path relative to the dataset root. + date: + description: Date in ISO 8601 format. + datetime: + description: Date and time in ISO 8601 format. + file_relative: + description: Path relative to the current file. + participant_relative: + description: Path relative to the participant directory. + rrid: + description: Research Resource Identifier. + stimuli_relative: + description: Path relative to the stimuli directory. + time: + description: Time in ISO 8601 format. + unit: + description: Unit of measurement. + uri: + description: Uniform Resource Identifier. + + IssueSeverity: + description: >- + Severity level for validation issues. + permissible_values: + error: + description: A rule violation that MUST be fixed. + warning: + description: A rule violation that SHOULD be fixed. + +classes: + + GeneralTerm: + description: >- + Base class for all BIDS schema terms. + Every term has a human-readable display name and a description. + attributes: + display_name: + range: string + required: true + description: A human-friendly name for display in tools. + description: + range: string + required: true + description: >- + A description of the term. May contain Markdown for rendering. + + # --- objects.* term types --- + + ValueTerm: + is_a: GeneralTerm + extra_slots: + allowed: true + description: >- + A term whose identity is a single string value. 
+ Used for datatypes, extensions, and as a base for suffixes + and named enum values. + attributes: + value: + range: string + required: true + description: The string value of the term. + + Datatype: + is_a: ValueTerm + description: >- + A subdirectory that organizes files by acquisition type, + such as anat, eeg, or func. + + Extension: + is_a: ValueTerm + description: >- + A filename component describing the format of the file, + including the leading dot. + + Suffix: + is_a: ValueTerm + extra_slots: + allowed: true + description: >- + A filename suffix describing the contents of a file. + attributes: + unit: + range: string + description: Interpretation of numeric values in the file. + anyOf: + range: JsonSchema + multivalued: true + inlined_as_list: true + description: >- + Multiple permissible unit or type descriptions. + maxValue: + range: float + description: Maximum permissible value in a data file. + minValue: + range: float + description: Minimum permissible value in a data file. + + NameValueTerm: + is_a: GeneralTerm + extra_slots: + allowed: true + description: >- + A term where a name, when present, has a given meaning and its + value may be restricted. Base for entities, metadata fields, + and columns. + attributes: + name: + range: string + required: true + description: >- + The name of the term as it appears in the specification + and in a dataset. + type: + range: string + description: >- + The JSON Schema type of the value, such as string, integer, + number, object, array, or boolean. + format: + range: FormatType + description: >- + The permissible format of the value, from objects.formats. + enum: + range: string + multivalued: true + inlined_as_list: true + description: >- + Exclusive list of valid values. + anyOf: + range: JsonSchema + multivalued: true + inlined_as_list: true + description: >- + A list of type constraints, any of which could apply. + items: + range: JsonSchema + description: >- + Constraints on array element types. 
+ properties: + range: JsonSchema + description: >- + Constraints on object field types. Values are arbitrary + JSON Schema objects. + additionalProperties: + range: JsonSchema + description: >- + Constraints on values of arbitrary object keys. + maximum: + range: float + description: Maximum for numeric values. + minimum: + range: float + description: Minimum for numeric values. + exclusiveMinimum: + range: float + description: Exclusive minimum for numeric values. + maxItems: + range: integer + description: Maximum number of items in an array. + minItems: + range: integer + description: Minimum number of items in an array. + required: + range: string + multivalued: true + inlined_as_list: true + description: >- + List of required property names for object-typed values. + recommended: + range: string + multivalued: true + inlined_as_list: true + description: >- + List of recommended property names for object-typed values. + + Entity: + is_a: NameValueTerm + description: >- + A name-value pair appearing in filenames. + slot_usage: + format: + required: true + + MetadataField: + is_a: NameValueTerm + extra_slots: + allowed: true + description: >- + A name-value pair appearing in JSON sidecar files. + attributes: + unit: + range: string + description: Interpretation of numeric values. + + Column: + is_a: NameValueTerm + extra_slots: + allowed: true + description: >- + A column heading and its value constraints in TSV files. + attributes: + unit: + range: string + description: Interpretation of numeric values. + pattern: + range: string + description: Regular expression constraining string values. + definition: + range: JsonSchema + description: >- + A JSON object describing the column as in the Tabular files + section of Common principles. Mutually exclusive with type. + + Format: + is_a: GeneralTerm + description: >- + Defines the form that values of a given format may take. 
+ attributes: + pattern: + range: string + required: true + description: Regular expression validating string values. + + FileObject: + is_a: GeneralTerm + description: >- + Files and directories that may appear at the root of a dataset. + attributes: + file_type: + range: string + required: true + description: >- + Whether the file is a regular file or a directory. + + EnumValue: + is_a: ValueTerm + extra_slots: + allowed: true + description: >- + A named enumerated value with full description, + used in objects.enums. + attributes: + tags: + range: string + multivalued: true + inlined_as_list: true + description: >- + Tags for categorizing the enum value. + + PrivateEnum: + description: >- + A private (underscore-prefixed) enum definition embedding + JSON Schema type and enum constraints. Used for reusable + value lists in objects.enums. + attributes: + type: + range: string + required: true + description: The JSON Schema type (typically string). + enum: + range: string + multivalued: true + inlined_as_list: true + required: true + description: The list of permissible values. + + JsonSchema: + description: >- + An arbitrary JSON Schema fragment. Used for embedded type + constraints in metadata fields, columns, and suffixes. + This is intentionally loosely typed to allow any valid + JSON Schema construct. + + # --- rules.* types --- + + Issue: + description: >- + A validation issue with a code, message, and severity. + attributes: + code: + range: string + required: true + description: Issue identifier such as EVENTS_TSV_MISSING. + message: + range: string + required: true + description: Message for display to a user. + level: + range: IssueSeverity + description: >- + Issue severity. May be omitted when the level is + determined by the containing rule. + + FieldSpec: + description: >- + Specification of a sidecar field or tabular column within a rule. + Can be either a plain requirement level string or an object with + additional details. 
+ attributes: + level: + range: RequirementLevel + required: true + description: Requirement level of the field. + level_addendum: + range: string + description: >- + Additional text describing when the requirement level changes. + description_addendum: + range: string + description: >- + Additional text appended to the field description from objects. + issue: + range: Issue + inlined: true + description: >- + An issue to raise if this field violates the rule. + + EntityOverride: + description: >- + An entity constraint within a file rule, specifying the + requirement level and optionally restricting valid values. + attributes: + level: + range: RequirementLevel + required: true + description: Requirement level for the entity. + enum: + range: string + multivalued: true + inlined_as_list: true + description: Exclusive list of valid entity values. + + SuffixRule: + description: >- + A filename rule specifying valid combinations of suffixes, + extensions, datatypes, and entities for BIDS files. + attributes: + suffixes: + range: string + multivalued: true + inlined_as_list: true + required: true + description: List of valid suffixes. + extensions: + range: string + multivalued: true + inlined_as_list: true + required: true + description: List of valid extensions including the leading dot. + datatypes: + range: string + multivalued: true + inlined_as_list: true + description: List of valid datatypes. + entities: + range: EntityRequirementMap + inlined: true + description: >- + Map from entity name to requirement level or override object. + level: + range: RequirementLevel + description: Requirement level of the file. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine whether this rule applies. + + PathRule: + description: >- + A rule for a file identified by its exact path relative to + the dataset root. 
+ attributes: + path: + range: string + required: true + description: >- + Location of the file relative to the dataset root. + level: + range: RequirementLevel + required: true + description: Requirement level of the file. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine whether this rule applies. + + StemRule: + description: >- + A rule for a file identified by its stem (name without + extension) relative to the dataset root. + attributes: + stem: + range: string + required: true + description: >- + Name of the file up to but not including the extension. + extensions: + range: string + multivalued: true + inlined_as_list: true + required: true + description: List of valid extensions. + level: + range: RequirementLevel + required: true + description: Requirement level of the file. + datatypes: + range: string + multivalued: true + inlined_as_list: true + description: List of valid datatypes. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine whether this rule applies. + + SidecarRule: + description: >- + A rule specifying required, recommended, or optional metadata + fields in JSON sidecar files. + attributes: + selectors: + range: string + multivalued: true + inlined_as_list: true + required: true + description: >- + Expressions that determine whether this rule applies. + fields: + range: FieldRequirementMap + inlined: true + description: >- + Map from field name to requirement level or FieldSpec. + + TabularDataRule: + description: >- + A rule specifying required, recommended, or optional columns + in TSV files. + attributes: + selectors: + range: string + multivalued: true + inlined_as_list: true + required: true + description: >- + Expressions that determine whether this rule applies. + columns: + range: FieldRequirementMap + inlined: true + description: >- + Map from column name to requirement level or FieldSpec. 
+ additional_columns: + range: string + required: true + description: >- + Whether extra columns are permitted. + One of allowed, allowed_if_defined, not_allowed, or n/a. + initial_columns: + range: string + multivalued: true + inlined_as_list: true + description: >- + Columns that must appear first, in order. + index_columns: + range: string + multivalued: true + inlined_as_list: true + description: >- + Columns that uniquely identify a row. + + CheckRule: + description: >- + A validation rule with selectors, checks, and an issue. + attributes: + issue: + range: Issue + inlined: true + required: true + description: >- + The issue to raise when any check expression fails. + selectors: + range: string + multivalued: true + inlined_as_list: true + required: true + description: >- + Expressions that determine whether this rule applies. + checks: + range: string + multivalued: true + inlined_as_list: true + required: true + description: >- + Expressions that must all evaluate to true. + + ErrorDefinition: + description: >- + An error or warning that cannot be expressed purely in the + schema but should be reported consistently by validators. + attributes: + code: + range: string + description: Error code identifier. + message: + range: string + required: true + description: Message for display to a user. + level: + range: IssueSeverity + required: true + description: Severity of the error. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine when this error applies. + + ModalityMapping: + description: >- + Maps a modality to its constituent datatypes. + attributes: + datatypes: + range: string + multivalued: true + inlined_as_list: true + required: true + description: List of datatypes belonging to this modality. + + DirectoryEntry: + extra_slots: + allowed: true + description: >- + A directory entry in the dataset directory structure. 
+ attributes: + name: + range: string + description: The directory name. + level: + range: RequirementLevel + description: Requirement level of the directory. + opaque: + range: boolean + description: >- + Whether the directory contents are outside BIDS scope. + subdirs: + range: string + multivalued: true + inlined_as_list: true + description: List of subdirectory names. + entity: + range: string + description: Entity associated with this directory level. + value: + range: string + description: Value or pattern for the directory name. + + # --- meta.* types --- + + AssociationTarget: + description: >- + The target file specification for a file association. + attributes: + suffix: + range: string + description: The suffix of the associated file. + extension: + description: >- + The extension(s) of the associated file. Can be a single + string or a list of valid extensions. + entities: + range: string + multivalued: true + inlined_as_list: true + description: >- + Entities to propagate when finding the associated file. + + Association: + description: >- + Defines how associated files (such as events.tsv for a BOLD + file) are discovered via the inheritance principle. + attributes: + target: + range: AssociationTarget + inlined: true + required: true + description: The target file to look for. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine when this association applies. + inherit: + range: boolean + description: >- + Whether to apply the inheritance principle when searching. + + ExpressionTest: + description: >- + A test case for the expression language, pairing an + expression string with its expected result. + attributes: + expression: + range: string + required: true + description: The expression to evaluate. + result: + description: >- + The expected result of evaluating the expression. + Can be any JSON value including null. 
+ + Template: + extra_slots: + allowed: true + description: >- + A reusable partial rule fragment, typically containing + entity requirements and optional selectors. May also + include suffixes and extensions for derivative templates. + Note: relevant only to the YAML presentation and not detailed + in the original metaschema.json. Currently used only for file rules, + as an "abstract class" or "interface" that is then mixed in. + attributes: + entities: + range: TemplateEntityMap + inlined: true + description: >- + Map from entity name to requirement level. + selectors: + range: string + multivalued: true + inlined_as_list: true + description: >- + Expressions that determine when this template applies. + suffixes: + range: string + multivalued: true + inlined_as_list: true + description: >- + List of valid suffixes for this template. + extensions: + range: string + multivalued: true + inlined_as_list: true + description: >- + List of valid extensions for this template. + + # --- Map wrapper classes --- + # These thin classes model JSON objects with arbitrary string keys + # and typed values. LinkML's extra_slots generates the correct + # additionalProperties in JSON Schema. + + ColumnMap: + extra_slots: + range_expression: + range: Column + description: >- + Map from column name to Column definitions. + + GeneralTermMap: + extra_slots: + range_expression: + range: GeneralTerm + description: >- + Map from term name to GeneralTerm definitions. + + DatatypeMap: + extra_slots: + range_expression: + range: Datatype + description: >- + Map from datatype name to Datatype definitions. + + EntityMap: + extra_slots: + range_expression: + range: Entity + description: >- + Map from entity name to Entity definitions. + + ExtensionMap: + extra_slots: + range_expression: + range: Extension + description: >- + Map from extension name to Extension definitions.
+ + FileObjectMap: + extra_slots: + range_expression: + range: FileObject + description: >- + Map from file name to FileObject definitions. + + FormatMap: + extra_slots: + range_expression: + range: Format + description: >- + Map from format name to Format definitions. + + MetadataFieldMap: + extra_slots: + range_expression: + range: MetadataField + description: >- + Map from field name to MetadataField definitions. + + SuffixMap: + extra_slots: + range_expression: + range: Suffix + description: >- + Map from suffix name to Suffix definitions. + + AssociationMap: + extra_slots: + range_expression: + range: Association + description: >- + Map from association name to Association objects. + + SidecarRuleMap: + extra_slots: + range_expression: + range: SidecarRule + description: >- + Map from rule name to SidecarRule objects. + + ErrorDefinitionMap: + extra_slots: + range_expression: + range: ErrorDefinition + description: >- + Map from error name to ErrorDefinition objects. + + ModalityMappingMap: + extra_slots: + range_expression: + range: ModalityMapping + description: >- + Map from modality name to ModalityMapping objects. + + EnumMap: + extra_slots: + allowed: true + description: >- + Map from enum name to EnumValue or PrivateEnum definitions. + Uses open additionalProperties since values can be either type. + + # Category 2: Nested map wrapper classes (group -> name -> TypedValue) + + TemplateMap: + extra_slots: + range_expression: + range: Template + description: >- + Map from template name to Template objects. + + TemplateGroupMap: + extra_slots: + range_expression: + range: TemplateMap + description: >- + Map from template category to TemplateMap. + + CheckRuleMap: + extra_slots: + range_expression: + range: CheckRule + description: >- + Map from check rule name to CheckRule objects. + + CheckRuleGroupMap: + extra_slots: + range_expression: + range: CheckRuleMap + description: >- + Map from check group name to CheckRuleMap. 
+ + DirectoryEntryMap: + extra_slots: + range_expression: + range: DirectoryEntry + description: >- + Map from directory name to DirectoryEntry objects. + + DirectoryGroupMap: + extra_slots: + range_expression: + range: DirectoryEntryMap + description: >- + Map from directory group name to DirectoryEntryMap. + + SidecarRuleGroupMap: + extra_slots: + range_expression: + range: SidecarRuleMap + description: >- + Map from sidecar group name to SidecarRuleMap. + + TabularDataRuleMap: + extra_slots: + range_expression: + range: TabularDataRule + description: >- + Map from rule name to TabularDataRule objects. + + TabularDataRuleGroupMap: + extra_slots: + range_expression: + range: TabularDataRuleMap + description: >- + Map from tabular data group name to TabularDataRuleMap. + + SuffixRuleMap: + extra_slots: + range_expression: + range: SuffixRule + description: >- + Map from suffix rule name to SuffixRule objects. + + SuffixRuleGroupMap: + extra_slots: + range_expression: + range: SuffixRuleMap + description: >- + Map from file rule group name to SuffixRuleMap. + + FileRuleGroupMap: + extra_slots: + allowed: true + description: >- + Map from file rule group name to open map of rules. + Used for common file rules where rule structure varies. + + # Category 3: Union-valued map wrapper classes + + EntityRequirementMap: + extra_slots: + range_expression: + any_of: + - range: RequirementLevel + - range: EntityOverride + description: >- + Map from entity name to RequirementLevel or EntityOverride. + + FieldRequirementMap: + extra_slots: + range_expression: + any_of: + - range: RequirementLevel + - range: FieldSpec + description: >- + Map from field/column name to RequirementLevel or FieldSpec. + + TemplateEntityMap: + extra_slots: + range_expression: + range: RequirementLevel + description: >- + Map from entity name to RequirementLevel for templates. 
+ + # --- Container classes --- + + MetaSection: + description: >- + The meta section of the BIDS schema, containing associations, + context, expression tests, templates, and version history. + attributes: + associations: + range: AssociationMap + inlined: true + description: >- + Map from association name to Association objects. + context: + description: >- + The context object is itself a JSON Schema defining the + namespace available to rule expressions. + expression_tests: + range: ExpressionTest + multivalued: true + inlined_as_list: true + description: >- + Test cases for the expression language. + templates: + range: TemplateGroupMap + inlined: true + description: >- + Nested map of template category to template name to + Template objects. + versions: + range: string + multivalued: true + inlined_as_list: true + description: >- + Version history as a list of semantic version strings. + + ObjectsSection: + description: >- + The objects section containing all term definitions. + attributes: + columns: + range: ColumnMap + inlined: true + description: >- + Map from column name to Column definitions. + common_principles: + range: GeneralTermMap + inlined: true + description: >- + Map from term name to GeneralTerm definitions. + datatypes: + range: DatatypeMap + inlined: true + description: >- + Map from datatype name to Datatype definitions. + entities: + range: EntityMap + inlined: true + description: >- + Map from entity name to Entity definitions. + enums: + range: EnumMap + inlined: true + description: >- + Map from enum name to EnumValue or PrivateEnum definitions. + extensions: + range: ExtensionMap + inlined: true + description: >- + Map from extension name to Extension definitions. + files: + range: FileObjectMap + inlined: true + description: >- + Map from file name to FileObject definitions. + formats: + range: FormatMap + inlined: true + description: >- + Map from format name to Format definitions. 
+ metadata: + range: MetadataFieldMap + inlined: true + description: >- + Map from field name to MetadataField definitions. + metaentities: + range: GeneralTermMap + inlined: true + description: >- + Map from metaentity name to GeneralTerm definitions. + modalities: + range: GeneralTermMap + inlined: true + description: >- + Map from modality name to GeneralTerm definitions. + suffixes: + range: SuffixMap + inlined: true + description: >- + Map from suffix name to Suffix definitions. + + FileRulesSection: + description: >- + Container for all file naming rules, organized by category. + attributes: + common: + range: FileRuleGroupMap + inlined: true + description: >- + Common file rules including core files and tables. + raw: + range: SuffixRuleGroupMap + inlined: true + description: >- + Raw data file rules, organized by datatype group. + deriv: + range: SuffixRuleGroupMap + inlined: true + description: >- + Derivative data file rules, organized by datatype group. + + RulesSection: + description: >- + The rules section containing validation and structural rules. + attributes: + checks: + range: CheckRuleGroupMap + inlined: true + description: >- + Map from check group name to map of check name + to CheckRule objects. + common_principles: + range: string + multivalued: true + inlined_as_list: true + description: >- + Ordered list of common principle term names. + dataset_metadata: + range: SidecarRuleMap + inlined: true + description: >- + Sidecar-like rules for dataset-level JSON files. + directories: + range: DirectoryGroupMap + inlined: true + description: >- + Directory structure rules for BIDS datasets. + entities: + range: string + multivalued: true + inlined_as_list: true + description: >- + Ordered list of entity names defining filename order. + errors: + range: ErrorDefinitionMap + inlined: true + description: >- + Map from error name to ErrorDefinition objects. 
+ files: + range: FileRulesSection + inlined: true + description: >- + File naming rules organized by common/raw/deriv. + json: + range: SidecarRuleGroupMap + inlined: true + description: >- + Sidecar-like rules for JSON schema validation. + metaentities: + range: string + multivalued: true + inlined_as_list: true + description: >- + Ordered list of metaentity term names. + modalities: + range: ModalityMappingMap + inlined: true + description: >- + Map from modality name to ModalityMapping objects. + sidecars: + range: SidecarRuleGroupMap + inlined: true + description: >- + Map from sidecar group to map of rule name + to SidecarRule objects. + tabular_data: + range: TabularDataRuleGroupMap + inlined: true + description: >- + Map from tabular group to map of rule name + to TabularDataRule objects. + + BidsSchema: + tree_root: true + description: >- + Top-level BIDS schema object, as produced by compiling + the schema YAML files via bst export. + attributes: + meta: + range: MetaSection + inlined: true + required: true + description: Meta-information about the schema. + objects: + range: ObjectsSection + inlined: true + required: true + description: All term and object definitions. + rules: + range: RulesSection + inlined: true + required: true + description: All validation and structural rules. + bids_version: + range: string + required: true + description: The BIDS specification version. + schema_version: + range: string + required: true + description: The schema structure version. diff --git a/src/schema/class_diagram.md b/src/schema/class_diagram.md new file mode 100644 index 0000000000..fe5d3663a1 --- /dev/null +++ b/src/schema/class_diagram.md @@ -0,0 +1,336 @@ +# BIDS LinkML Metaschema — Class Diagram + +Auto-generated from `bids_metaschema.yaml` by `gen_class_diagram.py`. + +Map wrapper classes (29 classes ending in "Map") are excluded for clarity. 
+ +```mermaid +classDiagram + + %% === Data model classes === + + class GeneralTerm { + <> + +string display_name + +string description + } + + class ValueTerm { + <> + +string value + } + + class Datatype { + } + + class Extension { + } + + class Suffix { + <> + -string unit + -JsonSchema[] anyOf + -float maxValue + -float minValue + } + + class NameValueTerm { + +string name + -string type + -FormatType format + -string[] enum + -JsonSchema[] anyOf + -JsonSchema items + -JsonSchema properties + -JsonSchema additionalProperties + -float maximum + -float minimum + -float exclusiveMinimum + -int maxItems + -int minItems + -string[] required + -string[] recommended + } + + class Entity { + <> + +* format + } + + class MetadataField { + <> + -string unit + } + + class Column { + <> + -string unit + -string pattern + -JsonSchema definition + } + + class Format { + <> + +string pattern + } + + class FileObject { + +string file_type + } + + class EnumValue { + -string[] tags + } + + class PrivateEnum { + +string type + +string[] enum + } + + class JsonSchema { + <> + } + + class Issue { + <> + +string code + +string message + -IssueSeverity level + } + + class FieldSpec { + +RequirementLevel level + -string level_addendum + -string description_addendum + -Issue issue + } + + class EntityOverride { + +RequirementLevel level + -string[] enum + } + + class SuffixRule { + +string[] suffixes + +string[] extensions + -string[] datatypes + -Map~any~ entities + -RequirementLevel level + -string[] selectors + } + + class PathRule { + +string path + +RequirementLevel level + -string[] selectors + } + + class StemRule { + +string stem + +string[] extensions + +RequirementLevel level + -string[] datatypes + -string[] selectors + } + + class SidecarRule { + +string[] selectors + -Map~any~ fields + } + + class TabularDataRule { + +string[] selectors + -Map~any~ columns + +string additional_columns + -string[] initial_columns + -string[] index_columns + } + + class CheckRule { + <> + 
+Issue issue + +string[] selectors + +string[] checks + } + + class ErrorDefinition { + -string code + +string message + +IssueSeverity level + -string[] selectors + } + + class ModalityMapping { + <> + +string[] datatypes + } + + class DirectoryEntry { + <> + -string name + -RequirementLevel level + -bool opaque + -string[] subdirs + -string entity + -string value + } + + class AssociationTarget { + <> + -string suffix + -string extension + -string[] entities + } + + class Association { + +AssociationTarget target + -string[] selectors + -bool inherit + } + + class ExpressionTest { + +string expression + -string result + } + + class Template { + -Map~RequirementLevel~ entities + -string[] selectors + -string[] suffixes + -string[] extensions + } + + class MetaSection { + -Map~Association~ associations + -string context + -ExpressionTest[] expression_tests + -Map~Map~Template~~ templates + -string[] versions + } + + class ObjectsSection { + <> + -Map~Column~ columns + -Map~GeneralTerm~ common_principles + -Map~Datatype~ datatypes + -Map~Entity~ entities + -Map~any~ enums + -Map~Extension~ extensions + -Map~FileObject~ files + -Map~Format~ formats + -Map~MetadataField~ metadata + -Map~GeneralTerm~ metaentities + -Map~GeneralTerm~ modalities + -Map~Suffix~ suffixes + } + + class FileRulesSection { + <> + -Map~any~ common + -Map~Map~SuffixRule~~ raw + -Map~Map~SuffixRule~~ deriv + } + + class RulesSection { + <> + -Map~Map~CheckRule~~ checks + -string[] common_principles + -Map~SidecarRule~ dataset_metadata + -Map~Map~DirectoryEntry~~ directories + -string[] entities + -Map~ErrorDefinition~ errors + -FileRulesSection files + -Map~Map~SidecarRule~~ json + -string[] metaentities + -Map~ModalityMapping~ modalities + -Map~Map~SidecarRule~~ sidecars + -Map~Map~TabularDataRule~~ tabular_data + } + + class BidsSchema { + +MetaSection meta + +ObjectsSection objects + +RulesSection rules + +string bids_version + +string schema_version + } + + %% === Enums === + + class 
RequirementLevel { + <> + required + recommended + optional + deprecated + } + + class FormatType { + <> + index + label + boolean + integer + number + string + hed_version + bids_uri + dataset_relative + date + datetime + file_relative + participant_relative + rrid + stimuli_relative + time + unit + uri + } + + class IssueSeverity { + <> + error + warning + } + + %% === Inheritance === + + GeneralTerm <|-- ValueTerm + ValueTerm <|-- Datatype + ValueTerm <|-- Extension + ValueTerm <|-- Suffix + GeneralTerm <|-- NameValueTerm + NameValueTerm <|-- Entity + NameValueTerm <|-- MetadataField + NameValueTerm <|-- Column + GeneralTerm <|-- Format + GeneralTerm <|-- FileObject + ValueTerm <|-- EnumValue + + %% === Composition relationships === + + Suffix --> "*" JsonSchema : anyOf + NameValueTerm --> FormatType : format + NameValueTerm --> "*" JsonSchema : anyOf + NameValueTerm --> "1" JsonSchema : items + NameValueTerm --> "1" JsonSchema : properties + NameValueTerm --> "1" JsonSchema : additionalProperties + Column --> "1" JsonSchema : definition + Issue --> IssueSeverity : level + FieldSpec --> RequirementLevel : level + FieldSpec *-- "1" Issue : issue + EntityOverride --> RequirementLevel : level + SuffixRule --> RequirementLevel : level + PathRule --> RequirementLevel : level + StemRule --> RequirementLevel : level + CheckRule *-- "1" Issue : issue + ErrorDefinition --> IssueSeverity : level + DirectoryEntry --> RequirementLevel : level + Association *-- "1" AssociationTarget : target + MetaSection --> "*" ExpressionTest : expression_tests + RulesSection *-- "1" FileRulesSection : files + BidsSchema *-- "1" MetaSection : meta + BidsSchema *-- "1" ObjectsSection : objects + BidsSchema *-- "1" RulesSection : rules + +``` diff --git a/src/schema/gen_class_diagram.py b/src/schema/gen_class_diagram.py new file mode 100644 index 0000000000..f8cdfa5fc4 --- /dev/null +++ b/src/schema/gen_class_diagram.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +"""Generate a Mermaid class 
SCHEMA_PATH = Path(__file__).parent.parent / "bids_metaschema.yaml"
OUTPUT_PATH = Path(__file__).parent / "class_diagram.md"

# LinkML built-in ranges and the short names used for them in the diagram.
_BUILTIN_TYPES = {
    "string": "string",
    "integer": "int",
    "float": "float",
    "boolean": "bool",
}


def load_schema(path: Path) -> dict:
    """Parse the LinkML schema YAML file at ``path`` and return it as a dict.

    The file is read as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def is_map_wrapper(name: str) -> bool:
    """Return True for map wrapper classes we want to exclude.

    A class counts as a map wrapper purely by naming convention: its name
    ends in ``"Map"``.
    """
    return name.endswith("Map")


def resolve_map_value_type(map_class_name: str, all_classes: dict) -> str | None:
    """For a Map wrapper class, return the value type it wraps.

    Looks at ``extra_slots.range_expression.range`` of the named class.
    Returns None when the class is unknown, has an empty body, or does not
    declare a single ``range`` there (open/untyped or union-valued maps).
    """
    # ``or {}`` covers a class declared with an empty YAML body (None).
    cls = all_classes.get(map_class_name) or {}
    extra = cls.get("extra_slots", {})
    if not isinstance(extra, dict):
        return None
    range_expr = extra.get("range_expression", {})
    if isinstance(range_expr, dict):
        return range_expr.get("range")
    return None


def mermaid_type(range_val: str | None, default: str = "string") -> str:
    """Convert a LinkML range to a short Mermaid-friendly type string.

    ``None`` maps to ``default``; built-in ranges are abbreviated
    (``integer`` -> ``int``, ``boolean`` -> ``bool``); any other range
    (class or enum name) is returned unchanged.
    """
    if range_val is None:
        return default
    return _BUILTIN_TYPES.get(range_val, range_val)


def format_attr(name: str, attr: dict) -> str:
    """Format a single attribute line for a Mermaid class block.

    Required attributes are prefixed with ``+``, optional ones with ``-``;
    multivalued attributes get a ``[]`` suffix on the type. A non-dict
    ``attr`` (e.g. an attribute declared with an empty YAML body) is
    treated as an attribute with no properties.
    """
    if not isinstance(attr, dict):
        attr = {}
    type_str = mermaid_type(attr.get("range"))
    if attr.get("multivalued", False):
        type_str = f"{type_str}[]"

    # Use +/- for required/optional visibility
    visibility = "+" if attr.get("required", False) else "-"
    return f"        {visibility}{type_str} {name}"


def _attribute_line(attr_name: str, attr_def: object, classes: dict) -> str:
    """Return the member line for one attribute, unwrapping Map ranges."""
    if isinstance(attr_def, dict):
        range_val = attr_def.get("range", "")
        if isinstance(range_val, str) and is_map_wrapper(range_val):
            # Resolve the map to its inner value type
            inner = resolve_map_value_type(range_val, classes)
            if inner and is_map_wrapper(inner):
                # Nested map (e.g., GroupMap -> Map -> Type)
                inner2 = resolve_map_value_type(inner, classes)
                type_str = f"Map~Map~{inner2 or '?'}~~"
            elif inner:
                type_str = f"Map~{inner}~"
            else:
                type_str = "Map~any~"
            # Map-typed attributes are always rendered with "-" visibility.
            return f"        -{type_str} {attr_name}"
    return format_attr(attr_name, attr_def)


def _class_block(name: str, cls: dict | None, classes: dict) -> list[str]:
    """Return the ``class Name { ... }`` lines for one data model class."""
    cls = cls or {}
    block = [f"    class {name} {{"]

    # Show the first sentence of the description as an annotation if short.
    desc = cls.get("description", "")
    if desc:
        # Collapse newlines/runs of spaces so a multi-line description
        # still yields a single-line first sentence.
        first_sentence = " ".join(desc.split()).split(". ")[0].rstrip(".")
        if first_sentence and len(first_sentence) <= 60:
            block.append(f"        <<{first_sentence}>>")

    for attr_name, attr_def in (cls.get("attributes", {}) or {}).items():
        block.append(_attribute_line(attr_name, attr_def, classes))

    # Show slot_usage overrides that make an inherited slot required.
    for su_name, su_def in (cls.get("slot_usage", {}) or {}).items():
        if isinstance(su_def, dict) and su_def.get("required"):
            block.append(f"        +* {su_name}")

    block.append("    }")
    block.append("")
    return block


def _enum_block(enum_name: str, enum_def: dict | None) -> list[str]:
    """Return the ``class Name { <<enumeration>> ... }`` lines for one enum."""
    block = [f"    class {enum_name} {{", "        <<enumeration>>"]
    permissible = (enum_def or {}).get("permissible_values", {}) or {}
    block.extend(f"        {val_name}" for val_name in permissible)
    block.append("    }")
    block.append("")
    return block


def _relationship_lines(name: str, cls: dict | None, data_classes: dict, enums: dict) -> list[str]:
    """Return association/composition arrows for one class's attributes."""
    arrows: list[str] = []
    for attr_name, attr_def in ((cls or {}).get("attributes", {}) or {}).items():
        if not isinstance(attr_def, dict):
            continue
        range_val = attr_def.get("range", "")
        if not isinstance(range_val, str):
            continue
        # Skip basic types
        if range_val in ("string", "integer", "float", "boolean", ""):
            continue
        if range_val in data_classes:
            # Use composition for inlined, association otherwise
            arrow = "*--" if attr_def.get("inlined", False) else "-->"
            card = '"*"' if attr_def.get("multivalued", False) else '"1"'
            arrows.append(f"    {name} {arrow} {card} {range_val} : {attr_name}")
        elif range_val in enums:
            arrows.append(f"    {name} --> {range_val} : {attr_name}")
    return arrows


def generate_diagram(schema: dict) -> str:
    """Build the Mermaid ``classDiagram`` source for a parsed LinkML schema.

    The output has four sections, in order: one block per data model class
    (map wrapper classes are left out), one block per enum, inheritance
    arrows derived from ``is_a``, and association/composition arrows for
    attributes whose range is a data model class or an enum.
    """
    classes = schema.get("classes", {}) or {}
    enums = schema.get("enums", {}) or {}

    # Separate data model classes from map wrappers
    data_classes = {
        name: cls for name, cls in classes.items() if not is_map_wrapper(name)
    }

    lines: list[str] = ["classDiagram", ""]

    # --- Class blocks with attributes ---
    lines += ["    %% === Data model classes ===", ""]
    for name, cls in data_classes.items():
        lines += _class_block(name, cls, classes)

    # --- Enum blocks ---
    lines += ["    %% === Enums ===", ""]
    for enum_name, enum_def in enums.items():
        lines += _enum_block(enum_name, enum_def)

    # --- Inheritance relationships ---
    lines += ["    %% === Inheritance ===", ""]
    for name, cls in data_classes.items():
        is_a = (cls or {}).get("is_a")
        if is_a and not is_map_wrapper(is_a):
            lines.append(f"    {is_a} <|-- {name}")
    lines.append("")

    # --- Composition relationships ---
    # For attributes whose range is a non-Map data class or enum
    lines += ["    %% === Composition relationships ===", ""]
    for name, cls in data_classes.items():
        lines += _relationship_lines(name, cls, data_classes, enums)
    lines.append("")

    return "\n".join(lines)


def render_page(diagram: str, map_wrapper_count: int) -> str:
    """Wrap ``diagram`` in the Markdown page written to ``class_diagram.md``.

    ``map_wrapper_count`` is the number of excluded map wrapper classes; it
    is interpolated into the page so the text cannot drift from the schema.
    """
    return f"""\
# BIDS LinkML Metaschema — Class Diagram

Auto-generated from `bids_metaschema.yaml` by `gen_class_diagram.py`.

Map wrapper classes ({map_wrapper_count} classes ending in "Map") are excluded for clarity.

```mermaid
{diagram}
```
"""


def main() -> None:
    """Regenerate ``class_diagram.md`` from ``bids_metaschema.yaml``."""
    schema = load_schema(SCHEMA_PATH)
    diagram = generate_diagram(schema)
    map_wrapper_count = sum(
        1 for name in (schema.get("classes", {}) or {}) if is_map_wrapper(name)
    )

    # The page contains a non-ASCII em dash, so pin the encoding.
    OUTPUT_PATH.write_text(render_page(diagram, map_wrapper_count), encoding="utf-8")
    print(f"Wrote {OUTPUT_PATH}")
    print("  Data model classes + enums visualized")
    print("  Map wrapper classes excluded")


if __name__ == "__main__":
    main()
+ +We would like to make it more "classy", that is, to define classes with linkml which would nicely translate into OOP e.g. in Python. +So then we could generate Python dataclasses to be used by `tools/schemacode/` python packages. +From linkml we would like also to generate TypeScript classes, for which an experimental branch (jsr-dist) was developed so check it out as well (e.g. in a worktree under .worktrees) + +We still want to use the new metaschema in linkml to validate our BIDS schema located under src/schema which is compiled into src/schema.json. +If you need "sources" of linkml, its git repos are also available locally under /home/yoh/proj/misc/linkml . + +# Discoveries after round 1 + +Looking at state at v1.11.1-18-g1320e864f and diagram at http://127.0.0.1:8000/en/stable/schema/class_diagram.html + +(We will use D{round}.{index} for identifiers) + +### D1.1 DataProperties + +It feels that we need a construct (class?) to define something like "DataProperties" (propose better name) to define + + - minimum: float + - maximum: float + - unit: str + + which is defined at the metaschema level and in principle can be hardcoded in the schema in association with some suffixes, e.g. + + ❯ show-paths -e unit: -f full-lines src/schema/objects/suffixes.yaml + 18 Chimap: + 28: unit: ppm + ... + + ❯ show-paths -e 'max' -f full-lines src/schema/objects/suffixes.yaml + 144 MTRmap: + 155: maxValue: 100 + ... + + to describe associated data file (e.g. nii.gz) properties and then could be hardcoded for some .tsv columns + + ❯ show-paths -e '(unit|max.*):' -f full-lines src/schema/objects/columns.yaml + ... + 311 low_cutoff: + 318: unit: Hz + ... + 353 metabolite_polar_fraction: + 360: maximum: 1 + 418: unit: s + ... + + and then could be provided in the actual datasets for columns in corresponding .json for .tsv files; and for data in data sidecar .json files: + + ❯ show-paths -e 'maxim' -f full-lines src/schema/objects/metadata.yaml + ... 
+ 2214 LabelingPulseFlipAngle: + 2222: maximum: 360 diff --git a/src/schema/linkml_design_plan.md b/src/schema/linkml_design_plan.md new file mode 100644 index 0000000000..6b4ababb01 --- /dev/null +++ b/src/schema/linkml_design_plan.md @@ -0,0 +1,241 @@ +# LinkML Metaschema Conversion — Plan + +## Current State + +| Layer | Current approach | +|-------|-----------------| +| **Metaschema** | `src/metaschema.json` — 742 lines of JSON Schema defining the shape of the BIDS schema | +| **Schema data** | ~90 YAML files under `src/schema/` organized as `meta/`, `objects/`, `rules/` | +| **Validation** | `bidsschematools.schema.get_schema_validator()` validates compiled schema against metaschema via `jsonschema` | +| **Python types** | `Namespace` dict-like wrapper — no real classes, no type safety | +| **TypeScript** | `jsr-dist` orphan branch uses `json-schema-to-typescript` to auto-generate types from JSON Schema | +| **LinkML tools** | Local repos at `/home/yoh/proj/misc/linkml/` for reference; install from PyPI via `uv pip install linkml` | + +## Scope + +**We are only replacing the metaschema** (`src/metaschema.json` → LinkML). +The BIDS schema YAML files under `src/schema/` themselves are not being modified. +The `$ref` conventions, expression DSL, and overall schema structure stay as-is. + +## Key Challenges + +1. **The BIDS metaschema isn't a typical data model.** It describes a schema-about-schemas. Some parts (e.g., `objects.metadata`, `objects.columns`) embed inline JSON Schema fragments (`type`, `anyOf`, `items`, `properties`, `additionalProperties`). LinkML will need to model this without losing expressiveness. + +2. **No `$ref` in scope.** The BIDS schema YAML files use a custom `$ref` convention, but these are fully dereferenced during compilation (`bst export`). The metaschema validates the **compiled** JSON output, which contains no `$ref` fields. The LinkML metaschema has no need to model `$ref` at all. + +3. 
**Expressions language.** `selectors` and `checks` are string-based DSL expressions. These are opaque strings from the metaschema's perspective — LinkML just needs to declare them as string lists, but long-term there may be value in formalizing them. + +4. **Heterogeneous rule shapes.** File rules (`pathRule`, `stemRule`, `suffixRule`), sidecar rules, tabular data rules, and check rules all have different shapes. These map well to LinkML classes with inheritance. + +## Phase 1: Modeling — Create the LinkML schema (`bids_metaschema.yaml`) + +The core work. Translate the concepts in `metaschema.json` into LinkML: + +- **Top-level class**: `BidsSchema` (with slots for `meta`, `objects`, `rules`, `bids_version`, `schema_version`) + +- **Object term types** as a class hierarchy: + - `GeneralTerm` (base: `display_name`, `description`) + - `NameValueTerm` extends `GeneralTerm` (adds `name`, `type`, `format`, `enum`, etc.) + - **`Entity`** — NameValueTerm with required `format` (label/index) + - **`MetadataField`** — NameValueTerm with JSON Schema value constraints, `unit`, `recommended` + - **`Column`** — NameValueTerm with optional `definition` (JSON object), `unit`, `format` + - `ValueTerm` extends `GeneralTerm` (adds `value`) + - **`Suffix`** — ValueTerm with optional `unit`, `anyOf`, `maxValue`, `minValue` + - **`Datatype`** — plain ValueTerm + - **`Extension`** — plain ValueTerm + - Independent classes (not subclasses of NameValue/Value): + - **`Format`** — GeneralTerm + required `pattern` + - **`FileObject`** — GeneralTerm + required `file_type` + - **`EnumValue`** — GeneralTerm + `value` + optional `tags` + +- **Rule types** as another hierarchy: + - `FileRule` (base for `PathRule`, `StemRule`, `SuffixRule`) + - `SidecarRule`, `TabularDataRule`, `CheckRule` + - `Issue`, `RequirementLevel` (enum) + +- **Meta types**: `Association`, `Context` (can remain as embedded JSON Schema) + +## Why the generated JSON Schema needs post-processing + +LinkML's JSON Schema 
generator (`gen-json-schema`) produces valid JSON Schema from the +LinkML model, but the BIDS metaschema relies heavily on a pattern that LinkML does not +natively express: **maps with arbitrary string keys and typed values**. +In JSON Schema this is `"additionalProperties": {"$ref": "#/$defs/SomeClass"}`. + +The post-processing script `patch_metaschema.py` bridges this gap. +The 43 patches fall into six categories: + +### Category 1: Simple typed maps (16 patches) + +Pattern: a slot whose value is `{arbitrary_key: TypedValue}`. + +Example: `ObjectsSection.entities` is a JSON object where every key is an entity name +(like `"subject"`, `"session"`) and every value conforms to the `Entity` class. +LinkML generates `"type": ["string", "null"]` for the slot (since it has no explicit range); +the patch replaces this with `"type": "object", "additionalProperties": {"$ref": "#/$defs/Entity"}`. + +Affected slots: `ObjectsSection.columns`, `.datatypes`, `.entities`, `.enums`, `.extensions`, +`.files`, `.formats`, `.metadata`, `.metaentities`, `.modalities`, `.suffixes`, +`.common_principles`; `RulesSection.dataset_metadata`, `.errors`, `.modalities`; +`MetaSection.associations`. + +### Category 2: Nested maps (9 patches) + +Pattern: two levels of arbitrary keys, `{groupName: {ruleName: TypedValue}}`. + +Example: `rules.checks` is `{anat: {T1wFileWithTooManyDimensions: CheckRule, ...}, ...}`. +The patch builds nested `additionalProperties`. + +Affected: `MetaSection.templates`, `RulesSection.checks`, `.directories`, `.json`, +`.sidecars`, `.tabular_data`; `FileRulesSection.common`, `.raw`, `.deriv`. + +### Category 3: Union-valued maps (4 patches) + +Pattern: a map where values can be either a plain string enum or an object. + +Example: `SuffixRule.entities` maps entity names to either a `RequirementLevel` string +(`"required"`, `"optional"`) or an `EntityOverride` object (`{level: "required", enum: [...]}`). +The patch uses `anyOf` in the `additionalProperties`. 
+ +Affected: `SuffixRule.entities`, `SidecarRule.fields`, `TabularDataRule.columns`, +`Template.entities`. + +### Category 4: Open classes (8 patches) + +LinkML generates `"additionalProperties": false` by default, but some BIDS classes embed +arbitrary JSON Schema properties (for example, `MetadataField` can have `items`, `properties`, +`additionalProperties`, and other JSON Schema keywords not enumerated in the class definition). +The patch removes the `"additionalProperties": false` constraint from these classes. + +Affected: `MetadataField`, `Column`, `NameValueTerm`, `Suffix`, `EnumValue`, `ValueTerm`, +`DirectoryEntry`, `Template`. + +### Category 5: Slot type coercions (5 patches) + +Individual slots that need type flexibility beyond what LinkML expresses: + +- `MetaSection.context` — the value is an arbitrary JSON Schema object (the context definition) +- `AssociationTarget.extension` — can be a string or an array of strings +- `ExpressionTest.result` — can be any JSON value (string, number, null, boolean, etc.) +- `DirectoryEntry.subdirs` — array items can be strings or objects (with `oneOf`) +- `JsonSchema` — the entire class is an open container for any JSON Schema fragment + +### Category 6: Root reference (1 patch) + +Sets `$ref` to `BidsSchema` so the schema validates the top-level object. +This is already handled by `tree_root: true` in the LinkML model, but `gen-json-schema` +does not emit a top-level `$ref` currently. + +### LinkML `extra_slots` — implemented using PR 2940 + +LinkML's [`extra_slots`](https://linkml.io/linkml-model/dev/docs/extra_slots/) feature +maps to JSON Schema `additionalProperties`. 
+Using the draft implementation from linkml/linkml#2940 +(`sneakers-the-rat/linkml@jsonschema-extra`), we have adopted `extra_slots` throughout +the BIDS metaschema: + +```yaml +# Generates: "additionalProperties": {"anyOf": [{"$ref": "#/$defs/Entity"}, {"type": "null"}]} +EntityMap: + extra_slots: + range_expression: + range: Entity + +# Generates: "additionalProperties": true +MetadataField: + extra_slots: + allowed: true + +# Generates: "additionalProperties": {"anyOf": [{$ref: RequirementLevel}, {$ref: EntityOverride}, ...]} +EntityRequirementMap: + extra_slots: + range_expression: + any_of: + - range: RequirementLevel + - range: EntityOverride +``` + +**Results**: The patch script has been reduced from **43 patches to 8**: +- Categories 1-4 (35 patches) are fully handled by `extra_slots` +- Category 5 (5 slot-level type coercions) still requires patches +- Category 6 (root `$ref`) still requires a patch +- Special: sidecars/tabular_data derivatives nesting (2 patches) — heterogeneous + depth that cannot be expressed with simple `extra_slots` + +**29 wrapper classes** were added to model the typed maps: +- 13 simple map classes (Category 1): `EntityMap`, `ColumnMap`, etc. +- 12 nested map classes (Category 2): `CheckRuleGroupMap` → `CheckRuleMap`, etc. +- 3 union map classes (Category 3): `EntityRequirementMap`, `FieldRequirementMap`, `TemplateEntityMap` +- 1 open map class: `EnumMap` (mixed EnumValue/PrivateEnum values) + +### Divergences to report against PR 2940 + +1. **Spurious `{type: null}` in `additionalProperties`**: When `extra_slots` has a + `range_expression` pointing to a class, the generated `additionalProperties` wraps + the reference in `anyOf: [{$ref: X}, {type: null}]`. The null alternative is incorrect + for map values — it means map values can be null, which is not the intent. The fix + would be to call `get_subschema_for_slot(..., include_null=False)` in + `get_additional_properties()` (line 714 of `jsonschemagen.py`). 
+ + This does NOT cause validation failures (the BIDS schema has no null map values), + but it makes the schema less strict than intended. + +2. **No issue with `allowed: true`**: Classes using `extra_slots: {allowed: true}` correctly + generate `additionalProperties: true`. + +3. **`any_of` in `range_expression` works correctly**: Union-valued maps like + `EntityRequirementMap` generate the expected `anyOf` with all alternatives, plus + the spurious null from issue #1. + +## Phase 2: Validation continuity — Generate JSON Schema from LinkML + +- Use `gen-json-schema bids_metaschema.yaml | python patch_metaschema.py` to produce + the final JSON Schema +- Verify it validates the compiled BIDS schema. The existing tooling already has what we need: + - `check-jsonschema` (already a test dependency) can validate `schema.json` against a schema file + - `bidsschematools` uses `jsonschema` library with `get_schema_validator()` — we just swap the metaschema source + - Concretely: `uv run bst export > /tmp/schema.json && check-jsonschema --schemafile generated_metaschema.json /tmp/schema.json` +- Compare coverage/strictness with current `metaschema.json` +- Iterate on the LinkML model until parity is achieved + +## Phase 3: Python code generation + +- Use `linkml gen-python` (or `gen-pydantic`) to produce typed Python classes +- Integrate into `bidsschematools` — replace the `Namespace` dict approach with proper dataclass loading +- **Namespace boundary consideration**: The `Namespace` class (`tools/schemacode/.../types/namespace.py`) provides recursive `.attribute` access during compilation/dereferencing. 
The generated typed classes need to either: + - Support the same attribute-style access (Pydantic models and dataclasses do this natively), or + - Serve as the "outer shell" with `Namespace` still used internally for the parts of the schema not described by the metaschema (e.g., the free-form JSON Schema fragments inside `objects.metadata` values) +- The schema loading path (`load_schema()` → dereference → validate) would return typed objects instead of nested dicts + +## Phase 4: TypeScript code generation + +- Use `linkml gen-typescript` to produce TypeScript interfaces +- Replace the current `json-schema-to-typescript` approach in `jsr-dist` +- No runtime library needed — the generated interfaces are pure type definitions with zero runtime dependencies, which is exactly what `jsr-dist` needs to type the compiled `schema.json` + +## Phase 5: CI and documentation + +- Replace `check-metaschema` pre-commit hook with LinkML-based validation +- Update `src/schema/README.md` (note: this README is part of the Sphinx docs for bidsschematools, so updates need to be consistent with that documentation build) +- Ensure `bst export-metaschema` outputs the generated JSON Schema for backwards compatibility + +## Class Hierarchy Diagram + +A Mermaid class diagram visualizing the 35 data model classes, 3 enums, +inheritance, and composition relationships is available in +[`class_diagram.md`](class_diagram.md). + +It is auto-generated from `bids_metaschema.yaml` by running: + +```bash +uv run python src/schema/gen_class_diagram.py +``` + +The diagram excludes the 29 map wrapper classes (names ending in "Map") +for clarity. Map-typed attributes are shown using `Map~ValueType~` notation +(e.g., `Map~Entity~` for a map with Entity values, `Map~Map~SuffixRule~~` +for a nested two-level map). + +## Suggested Starting Point + +Phase 1 is the critical path. 
Start with a minimal LinkML schema that models just the `objects.entities` and `objects.suffixes` sub-namespaces (they're the simplest — `GeneralTerm` + `NameValueTerm` / `ValueTerm`), then progressively add the more complex parts (`objects.metadata` with its embedded JSON Schema, then `rules.*`). diff --git a/src/schema/linkml_design_steps.md b/src/schema/linkml_design_steps.md new file mode 100644 index 0000000000..ac76ce9d06 --- /dev/null +++ b/src/schema/linkml_design_steps.md @@ -0,0 +1,54 @@ +# Phase 1 Implementation Steps + +## Step 1.1: Scaffold the LinkML schema with enums and base types + +Create `src/schema/bids_metaschema.yaml` with: +- Prefixes and schema metadata (id, name, license) +- `RequirementLevel` enum: `required`, `recommended`, `optional`, `deprecated` +- `FormatType` enum: all 17 values from current metaschema +- `IssueSeverity` enum: `error`, `warning` +- `GeneralTerm` class: slots `display_name` (required string), `description` (required string) + +## Step 1.2: Model the `objects.*` sub-namespaces + +- `ValueTerm` (extends `GeneralTerm`): add `value` (required string) — covers `datatypes`, `extensions` +- `Suffix` (extends `GeneralTerm` + `value`): add optional `unit`, `anyOf`, `maxValue`, `minValue` +- `NameValueTerm` (extends `GeneralTerm`): add `name` (required string), `type`, `format`, `enum` + with JSON Schema constraint fields (`maximum`, `minimum`, `anyOf`, `items`, `properties`, `additionalProperties`) +- `Entity`: NameValueTerm with required `format` +- `MetadataField`: NameValueTerm + `unit`, `recommended` +- `Column`: NameValueTerm + optional `definition` object, `unit` +- `Format`: GeneralTerm + required `pattern` +- `FileObject`: GeneralTerm + required `file_type` +- `EnumValue`: GeneralTerm + `value` + optional `tags` +- Container classes: `ObjectsCollection` tying them together with keyed maps + +## Step 1.3: Model the `rules.*` types + +- `Issue`: `code` (string), `message` (string), optional `level` (IssueSeverity) +- 
`SuffixRule`: `suffixes`, `extensions`, `entities` (map of entity to requirement), optional `datatypes`, `level`, `selectors` +- `PathRule`: `path`, `level` +- `StemRule`: `stem`, `extensions`, `level`, optional `datatypes`, `selectors` +- `SidecarRule`: `selectors`, `fields` (map of field name to requirement level or object with level/addendum/issue) +- `TabularDataRule`: `selectors`, `columns`, `additional_columns`, optional `initial_columns`, `index_columns` +- `CheckRule`: `selectors`, `checks`, `issue` +- Container classes for `rules.files`, `rules.sidecars`, `rules.tabular_data`, `rules.checks` + +## Step 1.4: Model the `meta.*` types + +- `Association`: `target` (with `entities`, `suffix`, `extension`), optional `selectors`, `inherit` +- `Context`: keep as free-form (JSON Schema embedded) +- `ExpressionTest`: `expression` (string), `result` (any) +- Ordering lists (`rules.entities`, `rules.common_principles`, `rules.metaentities`): string arrays + +## Step 1.5: Wire up the top-level `BidsSchema` class + +- Slots: `meta`, `objects`, `rules`, `bids_version` (string), `schema_version` (string) + +## Step 1.6: Validate round-trip + +- `linkml gen-json-schema bids_metaschema.yaml > /tmp/generated_metaschema.json` +- `uv run bst export > /tmp/schema.json` +- `check-jsonschema --schemafile /tmp/generated_metaschema.json /tmp/schema.json` +- Compare against `check-jsonschema --schemafile src/metaschema.json /tmp/schema.json` +- Iterate until parity diff --git a/src/schema/patch_metaschema.py b/src/schema/patch_metaschema.py new file mode 100644 index 0000000000..0ac38c1aed --- /dev/null +++ b/src/schema/patch_metaschema.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +"""Post-process LinkML-generated JSON Schema for BIDS metaschema. + +With the adoption of LinkML's extra_slots feature (PR linkml/linkml#2940), +the number of patches has been reduced from ~43 to ~6. 
def patch(schema: dict[str, Any]) -> dict[str, Any]:
    """Apply the remaining hand-written overrides to a generated JSON Schema.

    ``schema`` is modified in place and also returned. Each slot override
    is applied only when the slot exists under the class's ``properties``
    in ``$defs``; the slot's existing ``description`` is carried over
    (defaulting to an empty string). The root ``$ref`` is always set.
    """
    definitions: dict[str, Any] = schema.get("$defs", {})

    def override_slot(class_name: str, slot: str, **constraints: Any) -> None:
        # Swap the generated slot schema for description + ``constraints``.
        slots: dict[str, Any] = definitions.get(class_name, {}).get("properties", {})
        if slot in slots:
            slots[slot] = {
                "description": slots[slot].get("description", ""),
                **constraints,
            }

    def one_or_two_level_map(rule_class: str) -> dict[str, Any]:
        # Accept {RuleName: Rule} as well as {SubGroup: {RuleName: Rule}}.
        rule_map: dict[str, Any] = {
            "type": "object",
            "additionalProperties": {"$ref": f"#/$defs/{rule_class}"},
        }
        return {
            "anyOf": [
                rule_map,
                {"type": "object", "additionalProperties": rule_map},
            ],
        }

    # Categories 1-4 (typed maps, nested maps, union-valued maps and open
    # classes) are expressed with extra_slots in the LinkML schema itself,
    # so nothing is patched for them here.

    # --- Category 5: Slot-level type coercions ---

    # The context definition is an arbitrary JSON Schema object.
    override_slot("MetaSection", "context", type="object")

    # An association target extension is a string or an array of strings.
    override_slot(
        "AssociationTarget",
        "extension",
        anyOf=[
            {"type": "string"},
            {"type": "array", "items": {"type": "string"}},
        ],
    )

    # An expression test result may be any JSON value: no constraints left.
    override_slot("ExpressionTest", "result")

    # JsonSchema is an open container: keep only its description.
    if "JsonSchema" in definitions:
        definitions["JsonSchema"] = {
            "description": definitions["JsonSchema"].get("description", ""),
        }

    # Entries of subdirs are either strings or objects.
    override_slot(
        "DirectoryEntry",
        "subdirs",
        type="array",
        items={"anyOf": [{"type": "string"}, {"type": "object"}]},
    )

    # --- Special: sidecars/tabular_data derivatives nesting ---
    # The derivatives sub-group nests one level deeper than the regular
    # groups, so every group accepts both nesting depths.
    override_slot(
        "RulesSection",
        "sidecars",
        type="object",
        additionalProperties=one_or_two_level_map("SidecarRule"),
    )
    override_slot(
        "RulesSection",
        "tabular_data",
        type="object",
        additionalProperties=one_or_two_level_map("TabularDataRule"),
    )

    # --- Category 6: Root $ref ---
    schema["$ref"] = "#/$defs/BidsSchema"

    return schema
a/uv.lock b/uv.lock index c0caa55d10..619a1ff0fd 100644 --- a/uv.lock +++ b/uv.lock @@ -232,6 +232,7 @@ dependencies = [ { name = "mkdocs-branchcustomization-plugin" }, { name = "mkdocs-macros-plugin" }, { name = "mkdocs-material" }, + { name = "mkdocs-panzoom-plugin" }, { name = "mkdocs-redirects" }, { name = "numpy" }, { name = "pymdown-extensions" }, @@ -258,6 +259,7 @@ requires-dist = [ { name = "mkdocs-branchcustomization-plugin", specifier = "~=0.1.3" }, { name = "mkdocs-macros-plugin" }, { name = "mkdocs-material", specifier = ">=5.4" }, + { name = "mkdocs-panzoom-plugin", specifier = ">=0.5.2" }, { name = "mkdocs-redirects" }, { name = "numpy" }, { name = "pymdown-extensions", specifier = ">=9.2.0" }, @@ -1191,6 +1193,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, ] +[[package]] +name = "mkdocs-panzoom-plugin" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/03/0b75bf29b609daa544e852c3efee93181702ca68e46394895f61bd86f572/mkdocs_panzoom_plugin-0.5.2.tar.gz", hash = "sha256:f546356f5241fe5d499600be02a28cfe7d039c6f731d43f50eb6370065be2d3c", size = 22841, upload-time = "2025-12-22T15:46:58.929Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/58/370541865a451fc9e06f74d407f6d99581224455b5ffe5f55504a43ba6e4/mkdocs_panzoom_plugin-0.5.2-py3-none-any.whl", hash = "sha256:58746d7f7b8aef8787543a75b7a5ddcd04b67cfd6af964f25bbd9bc822ced68a", size = 22952, upload-time = "2025-12-22T15:46:57.123Z" }, +] + [[package]] name = "mkdocs-redirects" version = "1.2.2"