Skip to content

Commit 2246cd8

Browse files
authored
SCHEMA: Add file rule for phenotype tables (#1672)
* TEST: Add example to test phenotypic data
* SCHEMA: Add file rule for phenotype
* ENH: Update stem rule to accept datatypes, glob stems
* TEST: Update expected regexes for stem rules
* TEST: Make phenotype an exception for now
* feat(metaschema): Allow datatypes in stem rules
* feat(regex): Capture paths and stems, and match string ends
1 parent 1d929e5 commit 2246cd8

6 files changed

Lines changed: 58 additions & 11 deletions

File tree

src/metaschema.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,10 @@
751751
"type": "object",
752752
"properties": {
753753
"level": { "enum": ["optional", "recommended", "required"] },
754+
"datatypes": {
755+
"type": "array",
756+
"items": { "pattern": "^[a-z]+$" }
757+
},
754758
"stem": { "type": "string" },
755759
"extensions": { "type": "array", "items": { "type": "string" } }
756760
},

src/schema/rules/files/common/tables.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,16 @@ sessions: # This file may only exist if session is present in the dataset.
3030
- .json
3131
entities:
3232
subject: required
33+
34+
# Phenotype is a special case where there are no applicable entities, but a
35+
# parent directory is specified. This most closely matches datatype in the current
36+
# structure. We also require a stem that can match any value, as there are no
37+
# constraints on the filename except extension.
38+
phenotype:
39+
level: optional
40+
datatypes:
41+
- phenotype
42+
stem: '*'
43+
extensions:
44+
- .tsv
45+
- .json

tools/schemacode/bidsschematools/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"qmri_tb1tfl", # fmap, _TB1TFL
2626
"qmri_vfa", # derivatives
2727
"ds000248", # .bidsignore
28+
"fnirs_automaticity", # phenotypic
2829
]
2930
# Errors are described in the README of the respective datasets:
3031
# https://github.com/bids-standard/bids-error-examples

tools/schemacode/bidsschematools/data/tests/test_rules.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ def test_rule_objects(schema_obj):
8787

8888
# Build a list of items mentioned in rules, but not found in objects.
8989
if use not in object_values:
90+
if (use, object_type) == ("phenotype", "datatypes"):
91+
# Special case: phenotype is a top-level directory
92+
# that acts like a datatype, but we don't want to
93+
# define it that way in the glossary, currently.
94+
continue
9095
temp_path = path[:]
9196
if is_list:
9297
temp_path[-1] += f"[{i_use}]"

tools/schemacode/bidsschematools/rules.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
``schema.rules.files``.
55
"""
66

7+
import fnmatch
78
import re
89
import typing as ty
910
from collections.abc import Mapping
@@ -125,7 +126,7 @@ def _entity_rule(rule: Mapping, schema: bst.types.Namespace):
125126
ext_regex = f"(?P<extension>{ext_match})"
126127

127128
return {
128-
"regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex]),
129+
"regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex, r"\Z"]),
129130
"mandatory": False,
130131
}
131132

@@ -170,15 +171,24 @@ def _sanitize_extension(ext: str) -> str:
170171

171172

172173
def _stem_rule(rule: bst.types.Namespace):
173-
stem_regex = re.escape(rule.stem)
174+
# fnmatch.translate appends a trailing \Z (end-of-string anchor); strip it because the extension pattern follows the stem
175+
stem_match = fnmatch.translate(rule.stem)[:-2]
176+
stem_regex = f"(?P<stem>{stem_match})"
177+
178+
dtypes = set(rule.get("datatypes", ()))
179+
dir_regex = f"(?P<datatype>{'|'.join(dtypes)})/" if dtypes else ""
180+
174181
ext_match = "|".join(_sanitize_extension(ext) for ext in rule.extensions)
175-
ext_regex = f"(?P<extension>{ext_match})"
182+
ext_regex = rf"(?P<extension>{ext_match})\Z"
176183

177-
return {"regex": stem_regex + ext_regex, "mandatory": rule.level == "required"}
184+
return {"regex": dir_regex + stem_regex + ext_regex, "mandatory": rule.level == "required"}
178185

179186

180187
def _path_rule(rule: bst.types.Namespace):
181-
return {"regex": re.escape(rule.path), "mandatory": rule.level == "required"}
188+
path_match = re.escape(rule.path)
189+
# Exact path matches may be files or opaque directories
190+
# Consider using rules.directories to identify opaque directories
191+
return {"regex": rf"(?P<path>{path_match})(?:/.*)?\Z", "mandatory": rule.level == "required"}
182192

183193

184194
def regexify_filename_rules(

tools/schemacode/bidsschematools/tests/test_rules.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_entity_rule(schema_obj):
2121
r"sub-(?P=subject)_"
2222
r"(?:ses-(?P=session)_)?"
2323
r"(?P<suffix>T1w)"
24-
r"(?P<extension>\.nii)"
24+
r"(?P<extension>\.nii)\Z"
2525
),
2626
"mandatory": False,
2727
}
@@ -43,7 +43,7 @@ def test_entity_rule(schema_obj):
4343
r"(?:sub-(?P=subject)_)?"
4444
r"(?:ses-(?P=session)_)?"
4545
r"(?P<suffix>T1w)"
46-
r"(?P<extension>\.json)"
46+
r"(?P<extension>\.json)\Z"
4747
),
4848
"mandatory": False,
4949
}
@@ -84,28 +84,42 @@ def test_split_inheritance_rules():
8484
def test_stem_rule():
8585
rule = Namespace.build({"stem": "README", "level": "required", "extensions": ["", ".md"]})
8686
assert rules._stem_rule(rule) == {
87-
"regex": r"README(?P<extension>|\.md)",
87+
"regex": r"(?P<stem>(?s:README))(?P<extension>|\.md)\Z",
8888
"mandatory": True,
8989
}
9090

9191
rule = Namespace.build(
9292
{"stem": "participants", "level": "optional", "extensions": [".tsv", ".json"]}
9393
)
9494
assert rules._stem_rule(rule) == {
95-
"regex": r"participants(?P<extension>\.tsv|\.json)",
95+
"regex": r"(?P<stem>(?s:participants))(?P<extension>\.tsv|\.json)\Z",
96+
"mandatory": False,
97+
}
98+
99+
# Wildcard stem, with datatype
100+
rule = Namespace.build(
101+
{
102+
"stem": "*",
103+
"datatypes": ["phenotype"],
104+
"level": "optional",
105+
"extensions": [".tsv", ".json"],
106+
}
107+
)
108+
assert rules._stem_rule(rule) == {
109+
"regex": r"(?P<datatype>phenotype)/(?P<stem>(?s:.*))(?P<extension>\.tsv|\.json)\Z",
96110
"mandatory": False,
97111
}
98112

99113

100114
def test_path_rule():
101115
rule = Namespace.build({"path": "dataset_description.json", "level": "required"})
102116
assert rules._path_rule(rule) == {
103-
"regex": r"dataset_description\.json",
117+
"regex": r"(?P<path>dataset_description\.json)(?:/.*)?\Z",
104118
"mandatory": True,
105119
}
106120

107121
rule = Namespace.build({"path": "LICENSE", "level": "optional"})
108-
assert rules._path_rule(rule) == {"regex": "LICENSE", "mandatory": False}
122+
assert rules._path_rule(rule) == {"regex": r"(?P<path>LICENSE)(?:/.*)?\Z", "mandatory": False}
109123

110124

111125
def test_regexify_all():

0 commit comments

Comments
 (0)