From 5ff8d221d72ea3dfee9db016e283e5de955c9867 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:07:26 -0800
Subject: [PATCH 01/20] ref: don't pass DataModel  into BlockLearners

It's not needed in there, so should just pass what is needed.
This is prep for further refactoring of removing datamodel more
---
 dedupe/labeler.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index 9598292c..5f19e304 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -212,7 +212,7 @@ def _filter_canopy_predicates(
 class DedupeBlockLearner(BlockLearner):
     def __init__(
         self,
-        data_model: DataModel,
+        candidate_predicates: Iterable[Predicate],
         data: Data,
         index_include: TrainingExamples,
     ):
@@ -224,7 +224,7 @@ def __init__(
         index_data = sample_records(data, 50000)
         sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
 
-        preds = _filter_canopy_predicates(data_model.predicates, canopies=True)
+        preds = _filter_canopy_predicates(candidate_predicates, canopies=True)
         self.block_learner = training.DedupeBlockLearner(
             preds, sampled_records, index_data
         )
@@ -262,7 +262,7 @@ def _sample(self, data: Data, sample_size: int) -> TrainingExamples:
 class RecordLinkBlockLearner(BlockLearner):
     def __init__(
         self,
-        data_model: DataModel,
+        candidate_predicates: Iterable[Predicate],
         data_1: Data,
         data_2: Data,
         index_include: TrainingExamples,
@@ -276,7 +276,7 @@ def __init__(
         index_data = sample_records(data_2, 50000)
         sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)
 
-        preds = _filter_canopy_predicates(data_model.predicates, canopies=False)
+        preds = _filter_canopy_predicates(candidate_predicates, canopies=False)
         self.block_learner = training.RecordLinkBlockLearner(
             preds, sampled_records_1, sampled_records_2, index_data
         )
@@ -394,7 +394,7 @@ def __init__(
         index_include = index_include.copy()
         index_include.append(exact_match)
 
-        self.blocker = DedupeBlockLearner(data_model, data, index_include)
+        self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include)
 
         self._candidates = self.blocker.candidates.copy()
 
@@ -429,7 +429,9 @@ def __init__(
         index_include = index_include.copy()
         index_include.append(exact_match)
 
-        self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include)
+        self.blocker = RecordLinkBlockLearner(
+            data_model.predicates, data_1, data_2, index_include
+        )
         self._candidates = self.blocker.candidates
 
         self.matcher = MatchLearner(self.data_model, self.candidates)

From 3f75472dff9c7afb3093aff96df7f778fc592e12 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:15:23 -0800
Subject: [PATCH 02/20] ref: don't store DataModel in DisagreementLearner

We don't use it after the initial construction, so don't store it
---
 dedupe/labeler.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index 5f19e304..0ab8d9ea 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -318,8 +318,7 @@ class DisagreementLearner(HasCandidates):
     matcher: MatchLearner
     blocker: BlockLearner
 
-    def __init__(self, data_model: DataModel) -> None:
-        self.data_model = data_model
+    def __init__(self) -> None:
         self.y: numpy.typing.NDArray[numpy.int_] = numpy.array([])
         self.pairs: TrainingExamples = []
 
@@ -381,8 +380,7 @@ def __init__(
         data: Data,
         index_include: TrainingExamples,
     ):
-        super().__init__(data_model)
-
+        super().__init__()
         data = core.index(data)
 
         random_pair = (
@@ -398,7 +396,7 @@ def __init__(
 
         self._candidates = self.blocker.candidates.copy()
 
-        self.matcher = MatchLearner(self.data_model, self.candidates)
+        self.matcher = MatchLearner(data_model, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]
@@ -413,8 +411,7 @@ def __init__(
         data_2: Data,
         index_include: TrainingExamples,
     ):
-        super().__init__(data_model)
-
+        super().__init__()
         data_1 = core.index(data_1)
 
         offset = len(data_1)
@@ -434,7 +431,7 @@ def __init__(
         )
         self._candidates = self.blocker.candidates
 
-        self.matcher = MatchLearner(self.data_model, self.candidates)
+        self.matcher = MatchLearner(data_model, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]

From 7aa69eab26709c6af87451d76a46c49a6b53630d Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:18:50 -0800
Subject: [PATCH 03/20] ref: Don't require DataModel in MatchLearner

We only need the abstract requirement of a Featurizer function.
---
 dedupe/_typing.py |  6 ++++++
 dedupe/labeler.py | 26 +++++++++-----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/dedupe/_typing.py b/dedupe/_typing.py
index 1ab25018..b2e6d7e7 100644
--- a/dedupe/_typing.py
+++ b/dedupe/_typing.py
@@ -80,6 +80,12 @@ class TrainingData(TypedDict):
     distinct: List[RecordDictPair]
 
 
+# Takes pairs of records and generates a (n_samples X n_features) array
+FeaturizerFunction = Callable[
+    [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float_]
+]
+
+
 class Classifier(Protocol):
     """Takes an array of pairwise distances and computes the likelihood they are a pair."""
 
diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index 0ab8d9ea..88f332ed 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -15,7 +15,7 @@
 if TYPE_CHECKING:
     from typing import Dict, Iterable, Literal, Mapping
 
-    from dedupe._typing import Data, Labels, LabelsLike
+    from dedupe._typing import Data, FeaturizerFunction, Labels, LabelsLike
     from dedupe._typing import RecordDictPair as TrainingExample
     from dedupe._typing import RecordDictPairs as TrainingExamples
     from dedupe._typing import RecordIDPair
@@ -70,33 +70,25 @@ def _verify_fit_args(pairs: TrainingExamples, y: LabelsLike) -> list[Literal[0,
 
 
 class MatchLearner(Learner):
-    def __init__(self, data_model: DataModel, candidates: TrainingExamples):
-        self.data_model = data_model
+    def __init__(self, featurizer: FeaturizerFunction, candidates: TrainingExamples):
+        self._featurizer = featurizer
         self._candidates = candidates.copy()
         self._classifier = sklearn.linear_model.LogisticRegression()
-        self._distances = self._calc_distances(self.candidates)
+        self._features = self._featurizer(self.candidates)
 
     def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None:
         y = self._verify_fit_args(pairs, y)
-        self._classifier.fit(self._calc_distances(pairs), numpy.array(y))
+        self._classifier.fit(self._featurizer(pairs), numpy.array(y))
         self._fitted = True
 
     def remove(self, index: int) -> None:
         self._candidates.pop(index)
-        self._distances = numpy.delete(self._distances, index, axis=0)
+        self._features = numpy.delete(self._features, index, axis=0)
 
     def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]:
         if not self._fitted:
             raise ValueError("Must call fit() before candidate_scores()")
-        scores: numpy.typing.NDArray[numpy.float_] = self._classifier.predict_proba(
-            self._distances
-        )[:, 1].reshape(-1, 1)
-        return scores
-
-    def _calc_distances(
-        self, pairs: TrainingExamples
-    ) -> numpy.typing.NDArray[numpy.float_]:
-        return self.data_model.distances(pairs)
+        return self._classifier.predict_proba(self._features)[:, 1].reshape(-1, 1)
 
 
 class BlockLearner(Learner):
@@ -396,7 +388,7 @@ def __init__(
 
         self._candidates = self.blocker.candidates.copy()
 
-        self.matcher = MatchLearner(data_model, self.candidates)
+        self.matcher = MatchLearner(data_model.distances, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]
@@ -431,7 +423,7 @@ def __init__(
         )
         self._candidates = self.blocker.candidates
 
-        self.matcher = MatchLearner(data_model, self.candidates)
+        self.matcher = MatchLearner(data_model.distances, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]

From 7e5633a34da0e1028a40f1404e9622368390e736 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:22:00 -0800
Subject: [PATCH 04/20] bugfix:? Copy candidate preds in
 RecordLinkDisagreementLearner

It is copied in DedupeDisagreementLearner, so I'm assuming that's a
good idea. There's surely no downside that I can see,
so might as well DEFINITELY avoid
a foot gun
---
 dedupe/labeler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index 88f332ed..b6fe17b0 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -421,7 +421,7 @@ def __init__(
         self.blocker = RecordLinkBlockLearner(
             data_model.predicates, data_1, data_2, index_include
         )
-        self._candidates = self.blocker.candidates
+        self._candidates = self.blocker.candidates.copy()
 
         self.matcher = MatchLearner(data_model.distances, self.candidates)
 

From 1d1b8714d31690db3949f80c4a43c6b621ed51af Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:38:29 -0800
Subject: [PATCH 05/20] ref: Remove use of DataModel from active learners

---
 dedupe/api.py         |  6 ++++--
 dedupe/labeler.py     | 15 ++++++++-------
 tests/test_labeler.py |  4 +++-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index dc54f7b3..713ee183 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -1325,7 +1325,8 @@ def prepare_training(
         examples, y = flatten_training(self.training_pairs)
 
         self.active_learner = labeler.DedupeDisagreementLearner(
-            self.data_model,
+            self.data_model.predicates,
+            self.data_model.distances,
             data,
             index_include=examples,
         )
@@ -1392,7 +1393,8 @@ def prepare_training(
         examples, y = flatten_training(self.training_pairs)
 
         self.active_learner = labeler.RecordLinkDisagreementLearner(
-            self.data_model,
+            self.data_model.predicates,
+            self.data_model.distances,
             data_1,
             data_2,
             index_include=examples,
diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index b6fe17b0..8c51dd69 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -19,7 +19,6 @@
     from dedupe._typing import RecordDictPair as TrainingExample
     from dedupe._typing import RecordDictPairs as TrainingExamples
     from dedupe._typing import RecordIDPair
-    from dedupe.datamodel import DataModel
     from dedupe.predicates import Predicate
 
 
@@ -368,7 +367,8 @@ def learn_predicates(
 class DedupeDisagreementLearner(DisagreementLearner):
     def __init__(
         self,
-        data_model: DataModel,
+        candidate_predicates: Iterable[Predicate],
+        featurizer: FeaturizerFunction,
         data: Data,
         index_include: TrainingExamples,
     ):
@@ -384,11 +384,11 @@ def __init__(
         index_include = index_include.copy()
         index_include.append(exact_match)
 
-        self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include)
+        self.blocker = DedupeBlockLearner(candidate_predicates, data, index_include)
 
         self._candidates = self.blocker.candidates.copy()
 
-        self.matcher = MatchLearner(data_model.distances, self.candidates)
+        self.matcher = MatchLearner(featurizer, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]
@@ -398,7 +398,8 @@ def __init__(
 class RecordLinkDisagreementLearner(DisagreementLearner):
     def __init__(
         self,
-        data_model: DataModel,
+        candidate_predicates: Iterable[Predicate],
+        featurizer: FeaturizerFunction,
         data_1: Data,
         data_2: Data,
         index_include: TrainingExamples,
@@ -419,11 +420,11 @@ def __init__(
         index_include.append(exact_match)
 
         self.blocker = RecordLinkBlockLearner(
-            data_model.predicates, data_1, data_2, index_include
+            candidate_predicates, data_1, data_2, index_include
         )
         self._candidates = self.blocker.candidates.copy()
 
-        self.matcher = MatchLearner(data_model.distances, self.candidates)
+        self.matcher = MatchLearner(featurizer, self.candidates)
 
         examples = [exact_match] * 4 + [random_pair]
         labels: Labels = [1] * 4 + [0]  # type: ignore[assignment]
diff --git a/tests/test_labeler.py b/tests/test_labeler.py
index 7f9c8df2..30609ffa 100644
--- a/tests/test_labeler.py
+++ b/tests/test_labeler.py
@@ -43,7 +43,9 @@ def test_AL(self):
             ({"name": "William", "age": "35"}, {"name": "Jimbo", "age": "21"}),
         ]
         EXPECTED_CANDIDATES = {freeze_record_pair(pair) for pair in EXPECTED_CANDIDATES}
-        active_learner = labeler.DedupeDisagreementLearner(self.data_model, SAMPLE, [])
+        active_learner = labeler.DedupeDisagreementLearner(
+            self.data_model.predicates, self.data_model.distances, SAMPLE, []
+        )
         actual_candidates = set()
         for i in range(len(EXPECTED_CANDIDATES), 0, -1):
             assert len(active_learner) == i

From 7350aedd0940c14d01d2c768caa7e31838629b0d Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 18:48:28 -0800
Subject: [PATCH 06/20] ref: Don't use DataModel in core

---
 dedupe/api.py      |  4 ++--
 dedupe/core.py     | 26 +++++++++++++-------------
 tests/test_core.py |  4 ++--
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index 713ee183..d2733e00 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -107,7 +107,7 @@ def score(self, pairs: RecordPairs) -> Scores:
         """
         try:
             matches = core.scoreDuplicates(
-                pairs, self.data_model, self.classifier, self.num_cores
+                pairs, self.data_model.distances, self.classifier, self.num_cores
             )
         except RuntimeError:
             raise RuntimeError(
@@ -824,7 +824,7 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]:
         """
 
         matches = core.scoreGazette(
-            blocks, self.data_model, self.classifier, self.num_cores
+            blocks, self.data_model.distances, self.classifier, self.num_cores
         )
 
         return matches
diff --git a/dedupe/core.py b/dedupe/core.py
index 7d739af5..64a74f66 100644
--- a/dedupe/core.py
+++ b/dedupe/core.py
@@ -34,6 +34,7 @@
         Classifier,
         ClosableJoinable,
         Data,
+        FeaturizerFunction,
         Literal,
         MapLike,
         RecordID,
@@ -41,7 +42,6 @@
         RecordPairs,
         Scores,
     )
-    from dedupe.datamodel import DataModel
 
     _Queue = Union[multiprocessing.dummy.Queue, multiprocessing.Queue]
 
@@ -53,7 +53,7 @@ class BlockingError(Exception):
 class ScoreDupes(object):
     def __init__(
         self,
-        data_model: DataModel,
+        featurizer: FeaturizerFunction,
         classifier: Classifier,
         records_queue: _Queue,
         exception_queue: _Queue,
@@ -61,7 +61,7 @@ def __init__(
         dtype: numpy.dtype,
         offset,
     ):
-        self.data_model = data_model
+        self.featurizer = featurizer
         self.classifier = classifier
         self.records_queue = records_queue
         self.exception_queue = exception_queue
@@ -87,8 +87,8 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None:
         if not records:
             return
 
-        distances = self.data_model.distances(records)
-        scores = self.classifier.predict_proba(distances)[:, -1]
+        features = self.featurizer(records)
+        scores = self.classifier.predict_proba(features)[:, -1]
 
         mask = scores > 0
         if not mask.any():
@@ -113,7 +113,7 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None:
 
 def scoreDuplicates(
     record_pairs: RecordPairs,
-    data_model: DataModel,
+    featurizer: FeaturizerFunction,
     classifier: Classifier,
     num_cores: int = 1,
 ) -> Scores:
@@ -145,7 +145,7 @@ def scoreDuplicates(
 
     n_map_processes = max(num_cores, 1)
     score_records = ScoreDupes(
-        data_model,
+        featurizer,
         classifier,
         record_pairs_queue,
         exception_queue,
@@ -200,15 +200,15 @@ def fillQueue(
 
 
 class ScoreGazette(object):
-    def __init__(self, data_model: DataModel, classifier: Classifier):
-        self.data_model = data_model
+    def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier):
+        self.featurizer = featurizer
         self.classifier = classifier
 
     def __call__(self, block: Block) -> Scores:
         record_ids, records = zip(*(zip(*each) for each in block))
 
-        distances = self.data_model.distances(records)
-        scores = self.classifier.predict_proba(distances)[:, -1]
+        features = self.featurizer(records)
+        scores = self.classifier.predict_proba(features)[:, -1]
 
         id_type = sniff_id_type(record_ids)
         ids = numpy.array(record_ids, dtype=id_type)
@@ -227,7 +227,7 @@ def __call__(self, block: Block) -> Scores:
 
 def scoreGazette(
     record_pairs: Blocks,
-    data_model: DataModel,
+    featurizer: FeaturizerFunction,
     classifier: Classifier,
     num_cores: int = 1,
 ) -> Generator[Scores, None, None]:
@@ -238,7 +238,7 @@ def scoreGazette(
 
     imap, pool = appropriate_imap(num_cores)
 
-    score_records = ScoreGazette(data_model, classifier)
+    score_records = ScoreGazette(featurizer, classifier)
 
     for scored_pairs in imap(score_records, record_pairs):
         yield scored_pairs
diff --git a/tests/test_core.py b/tests/test_core.py
index 2b4fc1c5..9754f154 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -70,7 +70,7 @@ def setUp(self):
 
     def test_score_duplicates(self):
         scores = dedupe.core.scoreDuplicates(
-            self.records, self.data_model, self.classifier, 2
+            self.records, self.data_model.distances, self.classifier, 2
         )
 
         numpy.testing.assert_equal(scores["pairs"], self.desired_scored_pairs["pairs"])
@@ -94,7 +94,7 @@ def test_score_duplicates_with_zeros(self):
         expected = numpy.array([(["3", "4"], 1)], dtype=dtype)
 
         scores = dedupe.core.scoreDuplicates(
-            records, self.data_model, self.classifier, 2
+            records, self.data_model.distances, self.classifier, 2
         )
 
         assert isinstance(scores, numpy.memmap)

From 3c204171d1c106580a7de181ff04d0ecbc5cfa53 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 11:18:47 -0800
Subject: [PATCH 07/20] ref: remove __len__ from DataModel

It isn't used publicly anywhere,
and it adds more API surface area to worry about. This is still
backwards compatible, as self._len
is still saved/restored
byt pickle the same way, now we just access
it directly instead of going via __len__().
---
 dedupe/datamodel.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index bb433565..a1d50d0d 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -50,9 +50,6 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]):
 
         self._len = len(all_variables)
 
-    def __len__(self) -> int:
-        return self._len
-
     # Changing this from a property to just a normal attribute causes
     # pickling problems, because we are removing static methods from
     # their class context. This could be fixed by defining comparators
@@ -82,7 +79,7 @@ def distances(
     ) -> numpy.typing.NDArray[numpy.float_]:
         num_records = len(record_pairs)
 
-        distances = numpy.empty((num_records, len(self)), "f4")
+        distances = numpy.empty((num_records, self._len), "f4")
 
         for i, (record_1, record_2) in enumerate(record_pairs):
 

From 25f91a5f1944c222dd628ed1afa811c174da7bce Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 11:47:02 -0800
Subject: [PATCH 08/20] ref: make typifying variables more clear

---
 dedupe/datamodel.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index a1d50d0d..b107fb27 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -38,8 +38,8 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]):
         variable_definitions = list(variable_definitions)
         if not variable_definitions:
             raise ValueError("The variable definitions cannot be empty")
-        all_variables: list[Variable]
-        self.primary_variables, all_variables = typify_variables(variable_definitions)
+        self.primary_variables = typify_variables(variable_definitions)
+        all_variables = _expand_higher_variables(self.primary_variables)
         self._derived_start = len(all_variables)
 
         all_variables += interactions(variable_definitions, self.primary_variables)
@@ -141,9 +141,8 @@ def __setstate__(self, d):
 
 def typify_variables(
     variable_definitions: Iterable[VariableDefinition],
-) -> tuple[list[FieldVariable], list[Variable]]:
-    primary_variables: list[FieldVariable] = []
-    all_variables: list[Variable] = []
+) -> list[FieldVariable]:
+    variables: list[FieldVariable] = []
     only_custom = True
 
     for definition in variable_definitions:
@@ -188,13 +187,7 @@ def typify_variables(
         variable_object = variable_class(definition)
         assert isinstance(variable_object, FieldVariable)
 
-        primary_variables.append(variable_object)
-
-        if hasattr(variable_object, "higher_vars"):
-            all_variables.extend(variable_object.higher_vars)
-        else:
-            variable_object = cast(Variable, variable_object)
-            all_variables.append(variable_object)
+        variables.append(variable_object)
 
     if only_custom:
         raise ValueError(
@@ -203,7 +196,17 @@ def typify_variables(
             "blocking rules"
         )
 
-    return primary_variables, all_variables
+    return variables
+
+
+def _expand_higher_variables(variables: Iterable[Variable]) -> list[Variable]:
+    result: list[Variable] = []
+    for variable in variables:
+        if hasattr(variable, "higher_vars"):
+            result.extend(variable.higher_vars)
+        else:
+            result.append(variable)
+    return result
 
 
 def missing(variables: list[Variable]) -> list[MissingDataType]:

From b2234e83e9528b1531596068ea8cf615cf1fc6be Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 12:47:33 -0800
Subject: [PATCH 09/20] ref: tweak error raising in typify_variables()

---
 dedupe/datamodel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index b107fb27..2249dbbd 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -179,9 +179,9 @@ def typify_variables(
         try:
             variable_class = VARIABLE_CLASSES[variable_type]
         except KeyError:
+            valid = ", ".join(VARIABLE_CLASSES)
             raise KeyError(
-                "Field type %s not valid. Valid types include %s"
-                % (definition["type"], ", ".join(VARIABLE_CLASSES))
+                f"Variable type {variable_type} not valid. Valid types include {valid}"
             )
 
         variable_object = variable_class(definition)

From 13b9f5061f29e6e428d401305fa1f67a7c02ae03 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 12:50:59 -0800
Subject: [PATCH 10/20] ref: Clarify only_custom logic in typify_variables

---
 dedupe/datamodel.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 2249dbbd..83fbeae9 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -8,6 +8,7 @@
 import numpy
 
 import dedupe.variables
+from dedupe.variables.base import CustomType
 from dedupe.variables.base import FieldType as FieldVariable
 from dedupe.variables.base import MissingDataType, Variable
 from dedupe.variables.interaction import InteractionType
@@ -143,8 +144,6 @@ def typify_variables(
     variable_definitions: Iterable[VariableDefinition],
 ) -> list[FieldVariable]:
     variables: list[FieldVariable] = []
-    only_custom = True
-
     for definition in variable_definitions:
         try:
             variable_type = definition["type"]
@@ -163,9 +162,6 @@ def typify_variables(
                 "{'field' : 'Phone', type: 'String'}"
             )
 
-        if variable_type != "Custom":
-            only_custom = False
-
         if variable_type == "Interaction":
             continue
 
@@ -189,6 +185,7 @@ def typify_variables(
 
         variables.append(variable_object)
 
+    only_custom = all(isinstance(v, CustomType) for v in variables)
     if only_custom:
         raise ValueError(
             "At least one of the variable types needs to be a type"

From 78ef7ffb4766074b0926ff967ffada1fecba4b62 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 12:55:16 -0800
Subject: [PATCH 11/20] typ: Fix typing of typify_variables()

We iterate through the input more than once, so an Iterable
is insufficient.
---
 dedupe/datamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 83fbeae9..269ca576 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -141,7 +141,7 @@ def __setstate__(self, d):
 
 
 def typify_variables(
-    variable_definitions: Iterable[VariableDefinition],
+    variable_definitions: list[VariableDefinition],
 ) -> list[FieldVariable]:
     variables: list[FieldVariable] = []
     for definition in variable_definitions:

From 77b4192bd6cf854f2d2efc923ef9ae614d3585c1 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 13:12:54 -0800
Subject: [PATCH 12/20] ref: Further rename field to variable in datamodel

---
 dedupe/datamodel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 269ca576..bbca107a 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -217,13 +217,13 @@ def missing(variables: list[Variable]) -> list[MissingDataType]:
 def interactions(
     definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable]
 ) -> list[InteractionType]:
-    field_d = {field.name: field for field in primary_variables}
+    var_d = {var.name: var for var in primary_variables}
 
     interactions = []
     for definition in definitions:
         if definition["type"] == "Interaction":
             var = InteractionType(definition)
-            var.expandInteractions(field_d)
+            var.expandInteractions(var_d)
             interactions.extend(var.higher_vars)
     return interactions
 

From 497badc2ef51caf20d71223c1f435c4f11de20d4 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 13:46:38 -0800
Subject: [PATCH 13/20] test: Add more tests for interaction variables

---
 tests/test_api.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test_api.py b/tests/test_api.py
index 84ac9169..32458a17 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -58,11 +58,13 @@ def test_initialize_fields(self):
                 [],
             )
 
+        # only customs
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
                 [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}],
             )
 
+        # Only customs
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
                 [
@@ -71,6 +73,24 @@ def test_initialize_fields(self):
                 ],
             )
 
+        # Only custom and interactions
+        with self.assertRaises(ValueError):
+            dedupe.api.ActiveMatching(
+                [
+                    {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
+                    {"field": "age", "type": "Custom", "comparator": lambda x, y: 1},
+                    {"type": "Interaction", "interaction variables": ["name", "age"]},
+                ],
+            )
+
+        # Only interactions
+        with self.assertRaises(ValueError):
+            dedupe.api.ActiveMatching(
+                [
+                    {"type": "Interaction", "interaction variables": []},
+                ],
+            )
+
         dedupe.api.ActiveMatching(
             [
                 {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
@@ -78,6 +98,19 @@ def test_initialize_fields(self):
             ],
         )
 
+        dedupe.api.ActiveMatching(
+            [
+                {"field": "name", "variable name": "name", "type": "String"},
+                {
+                    "field": "age",
+                    "variable name": "age",
+                    "type": "Custom",
+                    "comparator": lambda x, y: 1,
+                },
+                {"type": "Interaction", "interaction variables": ["name", "age"]},
+            ],
+        )
+
     def test_check_record(self):
         matcher = dedupe.api.ActiveMatching(self.field_definition)
 

From c04981d7bb54a7809ea3d7dff09c0e8c98bb3bff Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 14:43:50 -0800
Subject: [PATCH 14/20] Ensure unique variable names

Before, we never checked for this,
so when we called list.index() we
just got back the first instance.

I swapped us over to a lookup table because that makes more
sense, but I wanted to also add this check because if someone had
duplicate names, this would quitely change the behavior underneath
someone: before it gave the first index, now it gives the last.
---
 dedupe/datamodel.py | 16 ++++++++++++++--
 tests/test_api.py   | 20 ++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index bbca107a..8f5b4108 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -233,15 +233,27 @@ def missing_field_indices(variables: list[Variable]) -> list[int]:
 
 
 def interaction_indices(variables: list[Variable]) -> list[list[int]]:
-    var_names = [var.name for var in variables]
+    _ensure_unique_names(variables)
+    name_to_index = {var.name: i for i, var in enumerate(variables)}
     indices = []
     for var in variables:
         if hasattr(var, "interaction_fields"):
-            interaction_indices = [var_names.index(f) for f in var.interaction_fields]  # type: ignore
+            interaction_indices = [name_to_index[f] for f in var.interaction_fields]  # type: ignore
             indices.append(interaction_indices)
     return indices
 
 
+def _ensure_unique_names(variables: Iterable[Variable]) -> None:
+    seen = set()
+    for var in variables:
+        if var.name in seen:
+            raise ValueError(
+                "Variable name used more than once! "
+                "Choose a unique name for each variable: '{var.name}'"
+            )
+        seen.add(var.name)
+
+
 def reduce_method(m):  # type: ignore[no-untyped-def]
     return (getattr, (m.__self__, m.__func__.__name__))
 
diff --git a/tests/test_api.py b/tests/test_api.py
index 32458a17..6f9f40dd 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -91,6 +91,26 @@ def test_initialize_fields(self):
                 ],
             )
 
+        # Duplicate variable names (explicitly)
+        with self.assertRaises(ValueError) as e:
+            dedupe.api.ActiveMatching(
+                [
+                    {"field": "age", "type": "String", "variable name": "my_age"},
+                    {"field": "age", "type": "ShortString", "variable name": "my_age"},
+                ],
+            )
+        assert "Variable name used more than once!" in str(e.exception)
+
+        # Duplicate variable names (implicitly)
+        with self.assertRaises(ValueError) as e:
+            dedupe.api.ActiveMatching(
+                [
+                    {"field": "age", "type": "String"},
+                    {"field": "age", "type": "String"},
+                ],
+            )
+        assert "Variable name used more than once!" in str(e.exception)
+
         dedupe.api.ActiveMatching(
             [
                 {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},

From 3e55496e4fabe5ca1845dd84343f6b38c9ada34f Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 14:52:35 -0800
Subject: [PATCH 15/20] ref: Remove "type" from non-exposed variables

These variables
1. you can't currently create them because they inherit from Variable,
not FieldVariable, so they don't appear in the
datamodel.VARIABLE_CLASSES
lookup table
2. You SHOULDN'T be able to instantiate these variables directly
from a variable definition
---
 dedupe/variables/base.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py
index f80b28fa..c958db5e 100644
--- a/dedupe/variables/base.py
+++ b/dedupe/variables/base.py
@@ -58,21 +58,16 @@ def all_subclasses(
 
 
 class DerivedType(Variable):
-    type = "Derived"
-
     def __init__(self, definition: VariableDefinition):
         self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"]))
         super(DerivedType, self).__init__(definition)
 
 
 class MissingDataType(Variable):
-    type = "MissingData"
+    has_missing = False
 
     def __init__(self, name: str):
-
-        self.name = "(%s: Not Missing)" % name
-
-        self.has_missing = False
+        self.name = f"({name}: Not Missing)"
 
 
 class FieldType(Variable):

From f32f511b41089886f8075b4a01fbda24c5ff917f Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 15:29:01 -0800
Subject: [PATCH 16/20] ref: Create InteractionVariables the same as the rest

Before the flow was:
1. create all the primary variables EXCEPT for the interactions
2. Expand those.
3. go back through and NOW create the InteractionVariables

I found this confusing. We parse the var definitions twice in separate
places, and InteractionVariables
are a special case in the first pass.

The other point of this is to make it so that there
is one list of Variable instances
which is the single source of truth. Once we do this
then we can turn all the other private instance variables of DataModel
into @functools.cached_property's,
which will make it much easier to
ensure pickle compatibility in
the future, because
only one variable needs to get
saved and restored
---
 dedupe/datamodel.py | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 8f5b4108..c4962b87 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -29,7 +29,7 @@
     )
     from dedupe.predicates import Predicate
 
-VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}
+VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k}
 
 
 class DataModel(object):
@@ -39,11 +39,16 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]):
         variable_definitions = list(variable_definitions)
         if not variable_definitions:
             raise ValueError("The variable definitions cannot be empty")
-        self.primary_variables = typify_variables(variable_definitions)
-        all_variables = _expand_higher_variables(self.primary_variables)
-        self._derived_start = len(all_variables)
-
-        all_variables += interactions(variable_definitions, self.primary_variables)
+        variables = typify_variables(variable_definitions)
+        non_interactions: list[FieldVariable] = [
+            v for v in variables if not isinstance(v, InteractionType)  # type: ignore[misc]
+        ]
+        self.primary_variables = non_interactions
+        expanded_primary = _expand_higher_variables(self.primary_variables)
+        self._derived_start = len(expanded_primary)
+
+        all_variables = expanded_primary.copy()
+        all_variables += _expanded_interactions(variables)
         all_variables += missing(all_variables)
 
         self._missing_field_indices = missing_field_indices(all_variables)
@@ -140,10 +145,8 @@ def __setstate__(self, d):
         self.__dict__ = d
 
 
-def typify_variables(
-    variable_definitions: list[VariableDefinition],
-) -> list[FieldVariable]:
-    variables: list[FieldVariable] = []
+def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]:
+    variables: list[Variable] = []
     for definition in variable_definitions:
         try:
             variable_type = definition["type"]
@@ -162,9 +165,6 @@ def typify_variables(
                 "{'field' : 'Phone', type: 'String'}"
             )
 
-        if variable_type == "Interaction":
-            continue
-
         if variable_type == "FuzzyCategorical" and "other fields" not in definition:
             definition["other fields"] = [  # type: ignore
                 d["field"]
@@ -181,11 +181,11 @@ def typify_variables(
             )
 
         variable_object = variable_class(definition)
-        assert isinstance(variable_object, FieldVariable)
+        assert isinstance(variable_object, Variable)
 
         variables.append(variable_object)
 
-    only_custom = all(isinstance(v, CustomType) for v in variables)
+    only_custom = all(isinstance(v, (CustomType, InteractionType)) for v in variables)
     if only_custom:
         raise ValueError(
             "At least one of the variable types needs to be a type"
@@ -214,16 +214,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]:
     return missing_variables
 
 
-def interactions(
-    definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable]
-) -> list[InteractionType]:
-    var_d = {var.name: var for var in primary_variables}
-
+def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]:
+    field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)}
     interactions = []
-    for definition in definitions:
-        if definition["type"] == "Interaction":
-            var = InteractionType(definition)
-            var.expandInteractions(var_d)
+    for var in variables:
+        if isinstance(var, InteractionType):
+            var.expandInteractions(field_vars)
             interactions.extend(var.higher_vars)
     return interactions
 

From e169cc221153bc524b03432a475a6d55aea42234 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Fri, 16 Sep 2022 15:41:26 -0800
Subject: [PATCH 17/20] ref: Move check for empty var def into typify_variables

---
 dedupe/datamodel.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index c4962b87..053b8ee1 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -36,9 +36,6 @@ class DataModel(object):
     version = 1
 
     def __init__(self, variable_definitions: Iterable[VariableDefinition]):
-        variable_definitions = list(variable_definitions)
-        if not variable_definitions:
-            raise ValueError("The variable definitions cannot be empty")
         variables = typify_variables(variable_definitions)
         non_interactions: list[FieldVariable] = [
             v for v in variables if not isinstance(v, InteractionType)  # type: ignore[misc]
@@ -145,7 +142,13 @@ def __setstate__(self, d):
         self.__dict__ = d
 
 
-def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]:
+def typify_variables(
+    variable_definitions: Iterable[VariableDefinition],
+) -> list[Variable]:
+    variable_definitions = list(variable_definitions)
+    if not variable_definitions:
+        raise ValueError("The variable definitions cannot be empty")
+
     variables: list[Variable] = []
     for definition in variable_definitions:
         try:
@@ -179,10 +182,8 @@ def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Var
             raise KeyError(
                 f"Variable type {variable_type} not valid. Valid types include {valid}"
             )
-
         variable_object = variable_class(definition)
         assert isinstance(variable_object, Variable)
-
         variables.append(variable_object)
 
     only_custom = all(isinstance(v, (CustomType, InteractionType)) for v in variables)

From c181763010bb3526d0bfd6b767b7c367de368170 Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 12:32:36 -0800
Subject: [PATCH 18/20] Move _load_settings() into separate function

This will make the init method cleaner and allow us to more easily add
versioning.

_load_settings will read the settings file, and for future versions
of the class, will parse and convert
the loaded data structures to
the canonical in-memory
representation that the class expects. See following commits
---
 dedupe/api.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index d2733e00..39009be9 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -978,10 +978,25 @@ def __init__(
         """
         super().__init__(num_cores, in_memory, **kwargs)
 
+        self.data_model, self.classifier, self.predicates = self._load_settings(
+            settings_file
+        )
+
+        logger.info("Predicate set:")
+        for predicate in self.predicates:
+            logger.info(predicate)
+
+        self._fingerprinter = blocking.Fingerprinter(self.predicates)
+
+    @staticmethod
+    def _load_settings(
+        settings_file: BinaryIO,
+    ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]:
         try:
-            self.data_model = pickle.load(settings_file)
-            self.classifier = pickle.load(settings_file)
-            self.predicates = pickle.load(settings_file)
+            data_model = pickle.load(settings_file)
+            classifier = pickle.load(settings_file)
+            predicates = pickle.load(settings_file)
+            return data_model, classifier, predicates
         except (KeyError, AttributeError):
             raise SettingsFileLoadingException(
                 "This settings file is not compatible with "
@@ -1007,12 +1022,6 @@ def __init__(
                 "Try deleting the file"
             )
 
-        logger.info("Predicate set:")
-        for predicate in self.predicates:
-            logger.info(predicate)
-
-        self._fingerprinter = blocking.Fingerprinter(self.predicates)
-
 
 class ActiveMatching(Matching):
     """

From 241c0a65ef555cc36a03ef13ae612aec7c3c999b Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 17:45:04 -0800
Subject: [PATCH 19/20] ref: Add versioning to _load_settings()

---
 dedupe/api.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index 39009be9..b1059638 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -14,6 +14,7 @@
 import sqlite3
 import tempfile
 import warnings
+from io import BytesIO
 from typing import TYPE_CHECKING, cast
 
 import numpy
@@ -988,15 +989,23 @@ def __init__(
 
         self._fingerprinter = blocking.Fingerprinter(self.predicates)
 
-    @staticmethod
+    @classmethod
     def _load_settings(
-        settings_file: BinaryIO,
+        cls, settings_file: BinaryIO
     ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]:
+        # Make a copy so we can read it multiple times
+        settings_file = BytesIO(settings_file.read())
+        settings_file.seek(0)
         try:
-            data_model = pickle.load(settings_file)
-            classifier = pickle.load(settings_file)
-            predicates = pickle.load(settings_file)
-            return data_model, classifier, predicates
+            version = pickle.load(settings_file)
+            if not isinstance(version, int):
+                settings_file.seek(0)
+                return cls._load_settings_v0(settings_file)
+            else:
+                raise SettingsFileLoadingException(
+                    "Settings file version {} not understood".format(version)
+                )
+
         except (KeyError, AttributeError):
             raise SettingsFileLoadingException(
                 "This settings file is not compatible with "
@@ -1022,6 +1031,15 @@ def _load_settings(
                 "Try deleting the file"
             )
 
+    @staticmethod
+    def _load_settings_v0(
+        settings_file: BinaryIO,
+    ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]:
+        data_model = pickle.load(settings_file)
+        classifier = pickle.load(settings_file)
+        predicates = pickle.load(settings_file)
+        return data_model, classifier, predicates
+
 
 class ActiveMatching(Matching):
     """

From 83a2e8927124e39321e44e3404034f630396740f Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Thu, 15 Sep 2022 17:48:21 -0800
Subject: [PATCH 20/20] ref: improve settings loading exception reporting

Deduplicate the one exception we make.

Also now instead of just eating the catchall exception, we raise from it
so that you have slightly more insight into its cause
---
 dedupe/api.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index b1059638..83168367 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -996,6 +996,10 @@ def _load_settings(
         # Make a copy so we can read it multiple times
         settings_file = BytesIO(settings_file.read())
         settings_file.seek(0)
+        catchall_exception = SettingsFileLoadingException(
+            "Something has gone wrong with loading the settings file. "
+            "Try deleting the file"
+        )
         try:
             version = pickle.load(settings_file)
             if not isinstance(version, int):
@@ -1021,15 +1025,9 @@ def _load_settings(
                     "install that library: `pip install rlr`"
                 )
             else:
-                raise SettingsFileLoadingException(
-                    "Something has gone wrong with loading the settings file. "
-                    "Try deleting the file"
-                ) from exc
-        except:  # noqa: E722
-            raise SettingsFileLoadingException(
-                "Something has gone wrong with loading the settings file. "
-                "Try deleting the file"
-            )
+                raise
+        except Exception as exc:
+            raise catchall_exception from exc
 
     @staticmethod
     def _load_settings_v0(