From 5ff8d221d72ea3dfee9db016e283e5de955c9867 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:07:26 -0800 Subject: [PATCH 01/20] ref: don't pass DataModel into BlockLearners It's not needed in there, so should just pass what is needed. This is prep for further refactoring of removing datamodel more --- dedupe/labeler.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 9598292c..5f19e304 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -212,7 +212,7 @@ def _filter_canopy_predicates( class DedupeBlockLearner(BlockLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], data: Data, index_include: TrainingExamples, ): @@ -224,7 +224,7 @@ def __init__( index_data = sample_records(data, 50000) sampled_records = sample_records(index_data, N_SAMPLED_RECORDS) - preds = _filter_canopy_predicates(data_model.predicates, canopies=True) + preds = _filter_canopy_predicates(candidate_predicates, canopies=True) self.block_learner = training.DedupeBlockLearner( preds, sampled_records, index_data ) @@ -262,7 +262,7 @@ def _sample(self, data: Data, sample_size: int) -> TrainingExamples: class RecordLinkBlockLearner(BlockLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], data_1: Data, data_2: Data, index_include: TrainingExamples, @@ -276,7 +276,7 @@ def __init__( index_data = sample_records(data_2, 50000) sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS) - preds = _filter_canopy_predicates(data_model.predicates, canopies=False) + preds = _filter_canopy_predicates(candidate_predicates, canopies=False) self.block_learner = training.RecordLinkBlockLearner( preds, sampled_records_1, sampled_records_2, index_data ) @@ -394,7 +394,7 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = DedupeBlockLearner(data_model, data, index_include) + self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include) self._candidates = self.blocker.candidates.copy() @@ -429,7 +429,9 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include) + self.blocker = RecordLinkBlockLearner( + data_model.predicates, data_1, data_2, index_include + ) self._candidates = self.blocker.candidates self.matcher = MatchLearner(self.data_model, self.candidates) From 3f75472dff9c7afb3093aff96df7f778fc592e12 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:15:23 -0800 Subject: [PATCH 02/20] ref: don't store DataModel in DisagreementLearner We don't use it after the initial construction, so don't store it --- dedupe/labeler.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 5f19e304..0ab8d9ea 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -318,8 +318,7 @@ class DisagreementLearner(HasCandidates): matcher: MatchLearner blocker: BlockLearner - def __init__(self, data_model: DataModel) -> None: - self.data_model = data_model + def __init__(self) -> None: self.y: numpy.typing.NDArray[numpy.int_] = numpy.array([]) self.pairs: TrainingExamples = [] @@ -381,8 +380,7 @@ def __init__( data: Data, index_include: TrainingExamples, ): - super().__init__(data_model) - + super().__init__() data = core.index(data) random_pair = ( @@ -398,7 +396,7 @@ def __init__( self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(self.data_model, self.candidates) + self.matcher = MatchLearner(data_model, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -413,8 +411,7 @@ def __init__( data_2: Data, index_include: TrainingExamples, ): - super().__init__(data_model) - + super().__init__() data_1 = core.index(data_1) offset = len(data_1) @@ -434,7 +431,7 @@ def __init__( ) self._candidates = self.blocker.candidates - self.matcher = MatchLearner(self.data_model, self.candidates) + self.matcher = MatchLearner(data_model, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] From 7aa69eab26709c6af87451d76a46c49a6b53630d Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:18:50 -0800 Subject: [PATCH 03/20] ref: Don't require DataModel in MatchLearner We only need the abstract requirement of a Featurizer function. --- dedupe/_typing.py | 6 ++++++ dedupe/labeler.py | 26 +++++++++----------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/dedupe/_typing.py b/dedupe/_typing.py index 1ab25018..b2e6d7e7 100644 --- a/dedupe/_typing.py +++ b/dedupe/_typing.py @@ -80,6 +80,12 @@ class TrainingData(TypedDict): distinct: List[RecordDictPair] +# Takes pairs of records and generates a (n_samples X n_features) array +FeaturizerFunction = Callable[ + [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float_] +] + + class Classifier(Protocol): """Takes an array of pairwise distances and computes the likelihood they are a pair.""" diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 0ab8d9ea..88f332ed 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from typing import Dict, Iterable, Literal, Mapping - from dedupe._typing import Data, Labels, LabelsLike + from dedupe._typing import Data, FeaturizerFunction, Labels, LabelsLike from dedupe._typing import RecordDictPair as TrainingExample from dedupe._typing import RecordDictPairs as TrainingExamples from dedupe._typing import RecordIDPair @@ -70,33 +70,25 @@ def _verify_fit_args(pairs: TrainingExamples, y: LabelsLike) -> list[Literal[0, class MatchLearner(Learner): - def __init__(self, data_model: DataModel, candidates: TrainingExamples): - self.data_model = data_model + def __init__(self, featurizer: FeaturizerFunction, candidates: TrainingExamples): + self._featurizer = featurizer self._candidates = candidates.copy() self._classifier = sklearn.linear_model.LogisticRegression() - self._distances = self._calc_distances(self.candidates) + self._features = self._featurizer(self.candidates) def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None: y = self._verify_fit_args(pairs, y) - self._classifier.fit(self._calc_distances(pairs), numpy.array(y)) + self._classifier.fit(self._featurizer(pairs), numpy.array(y)) self._fitted = True def remove(self, index: int) -> None: self._candidates.pop(index) - self._distances = numpy.delete(self._distances, index, axis=0) + self._features = numpy.delete(self._features, index, axis=0) def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]: if not self._fitted: raise ValueError("Must call fit() before candidate_scores()") - scores: numpy.typing.NDArray[numpy.float_] = self._classifier.predict_proba( - self._distances - )[:, 1].reshape(-1, 1) - return scores - - def _calc_distances( - self, pairs: TrainingExamples - ) -> numpy.typing.NDArray[numpy.float_]: - return self.data_model.distances(pairs) + return self._classifier.predict_proba(self._features)[:, 1].reshape(-1, 1) class BlockLearner(Learner): @@ -396,7 +388,7 @@ def __init__( self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model, self.candidates) + self.matcher = MatchLearner(data_model.distances, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -431,7 +423,7 @@ def __init__( ) self._candidates = self.blocker.candidates - self.matcher = MatchLearner(data_model, self.candidates) + self.matcher = MatchLearner(data_model.distances, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] From 7e5633a34da0e1028a40f1404e9622368390e736 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:22:00 -0800 Subject: [PATCH 04/20] bugfix:? Copy candidate preds in RecordLinkDisagreementLearner It is copied in DedupeDisagreementLearner, so I'm assuming that's a good idea. There's surely no downside that I can see, so might as well DEFINITELY avoid a foot gun --- dedupe/labeler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 88f332ed..b6fe17b0 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -421,7 +421,7 @@ def __init__( self.blocker = RecordLinkBlockLearner( data_model.predicates, data_1, data_2, index_include ) - self._candidates = self.blocker.candidates + self._candidates = self.blocker.candidates.copy() self.matcher = MatchLearner(data_model.distances, self.candidates) From 1d1b8714d31690db3949f80c4a43c6b621ed51af Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:38:29 -0800 Subject: [PATCH 05/20] ref: Remove use of DataModel from active learners --- dedupe/api.py | 6 ++++-- dedupe/labeler.py | 15 ++++++++------- tests/test_labeler.py | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index dc54f7b3..713ee183 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -1325,7 +1325,8 @@ def prepare_training( examples, y = flatten_training(self.training_pairs) self.active_learner = labeler.DedupeDisagreementLearner( - self.data_model, + self.data_model.predicates, + self.data_model.distances, data, index_include=examples, ) @@ -1392,7 +1393,8 @@ def prepare_training( examples, y = flatten_training(self.training_pairs) self.active_learner = labeler.RecordLinkDisagreementLearner( - self.data_model, + self.data_model.predicates, + self.data_model.distances, data_1, data_2, index_include=examples, diff --git a/dedupe/labeler.py b/dedupe/labeler.py index b6fe17b0..8c51dd69 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -19,7 +19,6 @@ from dedupe._typing import RecordDictPair as TrainingExample from dedupe._typing import RecordDictPairs as TrainingExamples from dedupe._typing import RecordIDPair - from dedupe.datamodel import DataModel from dedupe.predicates import Predicate @@ -368,7 +367,8 @@ def learn_predicates( class DedupeDisagreementLearner(DisagreementLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], + featurizer: FeaturizerFunction, data: Data, index_include: TrainingExamples, ): @@ -384,11 +384,11 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include) + self.blocker = DedupeBlockLearner(candidate_predicates, data, index_include) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model.distances, self.candidates) + self.matcher = MatchLearner(featurizer, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -398,7 +398,8 @@ def __init__( class RecordLinkDisagreementLearner(DisagreementLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], + featurizer: FeaturizerFunction, data_1: Data, data_2: Data, index_include: TrainingExamples, @@ -419,11 +420,11 @@ def __init__( index_include.append(exact_match) self.blocker = RecordLinkBlockLearner( - data_model.predicates, data_1, data_2, index_include + candidate_predicates, data_1, data_2, index_include ) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model.distances, self.candidates) + self.matcher = MatchLearner(featurizer, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 7f9c8df2..30609ffa 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -43,7 +43,9 @@ def test_AL(self): ({"name": "William", "age": "35"}, {"name": "Jimbo", "age": "21"}), ] EXPECTED_CANDIDATES = {freeze_record_pair(pair) for pair in EXPECTED_CANDIDATES} - active_learner = labeler.DedupeDisagreementLearner(self.data_model, SAMPLE, []) + active_learner = labeler.DedupeDisagreementLearner( + self.data_model.predicates, self.data_model.distances, SAMPLE, [] + ) actual_candidates = set() for i in range(len(EXPECTED_CANDIDATES), 0, -1): assert len(active_learner) == i From 7350aedd0940c14d01d2c768caa7e31838629b0d Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:48:28 -0800 Subject: [PATCH 06/20] ref: Don't use DataModel in core --- dedupe/api.py | 4 ++-- dedupe/core.py | 26 +++++++++++++------------- tests/test_core.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 713ee183..d2733e00 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -107,7 +107,7 @@ def score(self, pairs: RecordPairs) -> Scores: """ try: matches = core.scoreDuplicates( - pairs, self.data_model, self.classifier, self.num_cores + pairs, self.data_model.distances, self.classifier, self.num_cores ) except RuntimeError: raise RuntimeError( @@ -824,7 +824,7 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]: """ matches = core.scoreGazette( - blocks, self.data_model, self.classifier, self.num_cores + blocks, self.data_model.distances, self.classifier, self.num_cores ) return matches diff --git a/dedupe/core.py b/dedupe/core.py index 7d739af5..64a74f66 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -34,6 +34,7 @@ Classifier, ClosableJoinable, Data, + FeaturizerFunction, Literal, MapLike, RecordID, @@ -41,7 +42,6 @@ RecordPairs, Scores, ) - from dedupe.datamodel import DataModel _Queue = Union[multiprocessing.dummy.Queue, multiprocessing.Queue] @@ -53,7 +53,7 @@ class BlockingError(Exception): class ScoreDupes(object): def __init__( self, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, records_queue: _Queue, exception_queue: _Queue, @@ -61,7 +61,7 @@ def __init__( dtype: numpy.dtype, offset, ): - self.data_model = data_model + self.featurizer = featurizer self.classifier = classifier self.records_queue = records_queue self.exception_queue = exception_queue @@ -87,8 +87,8 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None: if not records: return - distances = self.data_model.distances(records) - scores = self.classifier.predict_proba(distances)[:, -1] + features = self.featurizer(records) + scores = self.classifier.predict_proba(features)[:, -1] mask = scores > 0 if not mask.any(): @@ -113,7 +113,7 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None: def scoreDuplicates( record_pairs: RecordPairs, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, num_cores: int = 1, ) -> Scores: @@ -145,7 +145,7 @@ def scoreDuplicates( n_map_processes = max(num_cores, 1) score_records = ScoreDupes( - data_model, + featurizer, classifier, record_pairs_queue, exception_queue, @@ -200,15 +200,15 @@ def fillQueue( class ScoreGazette(object): - def __init__(self, data_model: DataModel, classifier: Classifier): - self.data_model = data_model + def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier): + self.featurizer = featurizer self.classifier = classifier def __call__(self, block: Block) -> Scores: record_ids, records = zip(*(zip(*each) for each in block)) - distances = self.data_model.distances(records) - scores = self.classifier.predict_proba(distances)[:, -1] + features = self.featurizer(records) + scores = self.classifier.predict_proba(features)[:, -1] id_type = sniff_id_type(record_ids) ids = numpy.array(record_ids, dtype=id_type) @@ -227,7 +227,7 @@ def __call__(self, block: Block) -> Scores: def scoreGazette( record_pairs: Blocks, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, num_cores: int = 1, ) -> Generator[Scores, None, None]: @@ -238,7 +238,7 @@ def scoreGazette( imap, pool = appropriate_imap(num_cores) - score_records = ScoreGazette(data_model, classifier) + score_records = ScoreGazette(featurizer, classifier) for scored_pairs in imap(score_records, record_pairs): yield scored_pairs diff --git a/tests/test_core.py b/tests/test_core.py index 2b4fc1c5..9754f154 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -70,7 +70,7 @@ def setUp(self): def test_score_duplicates(self): scores = dedupe.core.scoreDuplicates( - self.records, self.data_model, self.classifier, 2 + self.records, self.data_model.distances, self.classifier, 2 ) numpy.testing.assert_equal(scores["pairs"], self.desired_scored_pairs["pairs"]) @@ -94,7 +94,7 @@ def test_score_duplicates_with_zeros(self): expected = numpy.array([(["3", "4"], 1)], dtype=dtype) scores = dedupe.core.scoreDuplicates( - records, self.data_model, self.classifier, 2 + records, self.data_model.distances, self.classifier, 2 ) assert isinstance(scores, numpy.memmap) From 3c204171d1c106580a7de181ff04d0ecbc5cfa53 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 11:18:47 -0800 Subject: [PATCH 07/20] ref: remove __len__ from DataModel It isn't used publicly anywhere, and it adds more API surface area to worry about. This is still backwards compatible, as self._len is still saved/restored byt pickle the same way, now we just access it directly instead of going via __len__(). --- dedupe/datamodel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index bb433565..a1d50d0d 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -50,9 +50,6 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): self._len = len(all_variables) - def __len__(self) -> int: - return self._len - # Changing this from a property to just a normal attribute causes # pickling problems, because we are removing static methods from # their class context. This could be fixed by defining comparators @@ -82,7 +79,7 @@ def distances( ) -> numpy.typing.NDArray[numpy.float_]: num_records = len(record_pairs) - distances = numpy.empty((num_records, len(self)), "f4") + distances = numpy.empty((num_records, self._len), "f4") for i, (record_1, record_2) in enumerate(record_pairs): From 25f91a5f1944c222dd628ed1afa811c174da7bce Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 11:47:02 -0800 Subject: [PATCH 08/20] ref: make typifying variables more clear --- dedupe/datamodel.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index a1d50d0d..b107fb27 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -38,8 +38,8 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - all_variables: list[Variable] - self.primary_variables, all_variables = typify_variables(variable_definitions) + self.primary_variables = typify_variables(variable_definitions) + all_variables = _expand_higher_variables(self.primary_variables) self._derived_start = len(all_variables) all_variables += interactions(variable_definitions, self.primary_variables) @@ -141,9 +141,8 @@ def __setstate__(self, d): def typify_variables( variable_definitions: Iterable[VariableDefinition], -) -> tuple[list[FieldVariable], list[Variable]]: - primary_variables: list[FieldVariable] = [] - all_variables: list[Variable] = [] +) -> list[FieldVariable]: + variables: list[FieldVariable] = [] only_custom = True for definition in variable_definitions: @@ -188,13 +187,7 @@ def typify_variables( variable_object = variable_class(definition) assert isinstance(variable_object, FieldVariable) - primary_variables.append(variable_object) - - if hasattr(variable_object, "higher_vars"): - all_variables.extend(variable_object.higher_vars) - else: - variable_object = cast(Variable, variable_object) - all_variables.append(variable_object) + variables.append(variable_object) if only_custom: raise ValueError( @@ -203,7 +196,17 @@ def typify_variables( "blocking rules" ) - return primary_variables, all_variables + return variables + + +def _expand_higher_variables(variables: Iterable[Variable]) -> list[Variable]: + result: list[Variable] = [] + for variable in variables: + if hasattr(variable, "higher_vars"): + result.extend(variable.higher_vars) + else: + result.append(variable) + return result def missing(variables: list[Variable]) -> list[MissingDataType]: From b2234e83e9528b1531596068ea8cf615cf1fc6be Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:47:33 -0800 Subject: [PATCH 09/20] ref: tweak error raising in typify_variables() --- dedupe/datamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index b107fb27..2249dbbd 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -179,9 +179,9 @@ def typify_variables( try: variable_class = VARIABLE_CLASSES[variable_type] except KeyError: + valid = ", ".join(VARIABLE_CLASSES) raise KeyError( - "Field type %s not valid. Valid types include %s" - % (definition["type"], ", ".join(VARIABLE_CLASSES)) + f"Variable type {variable_type} not valid. Valid types include {valid}" ) variable_object = variable_class(definition) From 13b9f5061f29e6e428d401305fa1f67a7c02ae03 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:50:59 -0800 Subject: [PATCH 10/20] ref: Clarify only_custom logic in typify_variables --- dedupe/datamodel.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 2249dbbd..83fbeae9 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -8,6 +8,7 @@ import numpy import dedupe.variables +from dedupe.variables.base import CustomType from dedupe.variables.base import FieldType as FieldVariable from dedupe.variables.base import MissingDataType, Variable from dedupe.variables.interaction import InteractionType @@ -143,8 +144,6 @@ def typify_variables( variable_definitions: Iterable[VariableDefinition], ) -> list[FieldVariable]: variables: list[FieldVariable] = [] - only_custom = True - for definition in variable_definitions: try: variable_type = definition["type"] @@ -163,9 +162,6 @@ def typify_variables( "{'field' : 'Phone', type: 'String'}" ) - if variable_type != "Custom": - only_custom = False - if variable_type == "Interaction": continue @@ -189,6 +185,7 @@ def typify_variables( variables.append(variable_object) + only_custom = all(isinstance(v, CustomType) for v in variables) if only_custom: raise ValueError( "At least one of the variable types needs to be a type" From 78ef7ffb4766074b0926ff967ffada1fecba4b62 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:55:16 -0800 Subject: [PATCH 11/20] typ: Fix typing of typify_variables() We iterate through the input more than once, so an Iterable is insufficient. --- dedupe/datamodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 83fbeae9..269ca576 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -141,7 +141,7 @@ def __setstate__(self, d): def typify_variables( - variable_definitions: Iterable[VariableDefinition], + variable_definitions: list[VariableDefinition], ) -> list[FieldVariable]: variables: list[FieldVariable] = [] for definition in variable_definitions: From 77b4192bd6cf854f2d2efc923ef9ae614d3585c1 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 13:12:54 -0800 Subject: [PATCH 12/20] ref: Further rename field to variable in datamodel --- dedupe/datamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 269ca576..bbca107a 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -217,13 +217,13 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: def interactions( definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] ) -> list[InteractionType]: - field_d = {field.name: field for field in primary_variables} + var_d = {var.name: var for var in primary_variables} interactions = [] for definition in definitions: if definition["type"] == "Interaction": var = InteractionType(definition) - var.expandInteractions(field_d) + var.expandInteractions(var_d) interactions.extend(var.higher_vars) return interactions From 497badc2ef51caf20d71223c1f435c4f11de20d4 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 13:46:38 -0800 Subject: [PATCH 13/20] test: Add more tests for interaction variables --- tests/test_api.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 84ac9169..32458a17 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -58,11 +58,13 @@ def test_initialize_fields(self): [], ) + # only customs with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}], ) + # Only customs with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [ @@ -71,6 +73,24 @@ def test_initialize_fields(self): ], ) + # Only custom and interactions + with self.assertRaises(ValueError): + dedupe.api.ActiveMatching( + [ + {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, + {"field": "age", "type": "Custom", "comparator": lambda x, y: 1}, + {"type": "Interaction", "interaction variables": ["name", "age"]}, + ], + ) + + # Only interactions + with self.assertRaises(ValueError): + dedupe.api.ActiveMatching( + [ + {"type": "Interaction", "interaction variables": []}, + ], + ) + dedupe.api.ActiveMatching( [ {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, @@ -78,6 +98,19 @@ def test_initialize_fields(self): ], ) + dedupe.api.ActiveMatching( + [ + {"field": "name", "variable name": "name", "type": "String"}, + { + "field": "age", + "variable name": "age", + "type": "Custom", + "comparator": lambda x, y: 1, + }, + {"type": "Interaction", "interaction variables": ["name", "age"]}, + ], + ) + def test_check_record(self): matcher = dedupe.api.ActiveMatching(self.field_definition) From c04981d7bb54a7809ea3d7dff09c0e8c98bb3bff Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 14:43:50 -0800 Subject: [PATCH 14/20] Ensure unique variable names Before, we never checked for this, so when we called list.index() we just got back the first instance. I swapped us over to a lookup table because that makes more sense, but I wanted to also add this check because if someone had duplicate names, this would quitely change the behavior underneath someone: before it gave the first index, now it gives the last. --- dedupe/datamodel.py | 16 ++++++++++++++-- tests/test_api.py | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index bbca107a..8f5b4108 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -233,15 +233,27 @@ def missing_field_indices(variables: list[Variable]) -> list[int]: def interaction_indices(variables: list[Variable]) -> list[list[int]]: - var_names = [var.name for var in variables] + _ensure_unique_names(variables) + name_to_index = {var.name: i for i, var in enumerate(variables)} indices = [] for var in variables: if hasattr(var, "interaction_fields"): - interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore + interaction_indices = [name_to_index[f] for f in var.interaction_fields] # type: ignore indices.append(interaction_indices) return indices +def _ensure_unique_names(variables: Iterable[Variable]) -> None: + seen = set() + for var in variables: + if var.name in seen: + raise ValueError( + "Variable name used more than once! " + "Choose a unique name for each variable: '{var.name}'" + ) + seen.add(var.name) + + def reduce_method(m): # type: ignore[no-untyped-def] return (getattr, (m.__self__, m.__func__.__name__)) diff --git a/tests/test_api.py b/tests/test_api.py index 32458a17..6f9f40dd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -91,6 +91,26 @@ def test_initialize_fields(self): ], ) + # Duplicate variable names (explicitly) + with self.assertRaises(ValueError) as e: + dedupe.api.ActiveMatching( + [ + {"field": "age", "type": "String", "variable name": "my_age"}, + {"field": "age", "type": "ShortString", "variable name": "my_age"}, + ], + ) + assert "Variable name used more than once!" in str(e.exception) + + # Duplicate variable names (implicitly) + with self.assertRaises(ValueError) as e: + dedupe.api.ActiveMatching( + [ + {"field": "age", "type": "String"}, + {"field": "age", "type": "String"}, + ], + ) + assert "Variable name used more than once!" in str(e.exception) + dedupe.api.ActiveMatching( [ {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, From 3e55496e4fabe5ca1845dd84343f6b38c9ada34f Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 14:52:35 -0800 Subject: [PATCH 15/20] ref: Remove "type" from non-exposed variables These variables 1. you can't currently create them because they inherit from Variable, not FieldVariable, so they don't appear in the datamodel.VARIABLE_CLASSES lookup table 2. You SHOULDN'T be able to instantiate these variables directly from a variable definition --- dedupe/variables/base.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index f80b28fa..c958db5e 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -58,21 +58,16 @@ def all_subclasses( class DerivedType(Variable): - type = "Derived" - def __init__(self, definition: VariableDefinition): self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"])) super(DerivedType, self).__init__(definition) class MissingDataType(Variable): - type = "MissingData" + has_missing = False def __init__(self, name: str): - - self.name = "(%s: Not Missing)" % name - - self.has_missing = False + self.name = f"({name}: Not Missing)" class FieldType(Variable): From f32f511b41089886f8075b4a01fbda24c5ff917f Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 15:29:01 -0800 Subject: [PATCH 16/20] ref: Create InteractionVariables the same as the rest Before the flow was: 1. create all the primary variables EXCEPT for the interactions 2. Expand those. 3. go back through and NOW create the InteractionVariables I found this confusing. We parse the var definitions twice in separate places, and InteractionVariables are a special case in the first pass. The other point of this is to make it so that there is one list of Variable instances which is the single source of truth. Once we do this then we can turn all the other private instance variables of DataModel into @functools.cached_property's, which will make it much easier to ensure pickle compatibility in the future, because only one variable needs to get saved and restored --- dedupe/datamodel.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 8f5b4108..c4962b87 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -29,7 +29,7 @@ ) from dedupe.predicates import Predicate -VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} +VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k} class DataModel(object): @@ -39,11 +39,16 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - self.primary_variables = typify_variables(variable_definitions) - all_variables = _expand_higher_variables(self.primary_variables) - self._derived_start = len(all_variables) - - all_variables += interactions(variable_definitions, self.primary_variables) + variables = typify_variables(variable_definitions) + non_interactions: list[FieldVariable] = [ + v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] + ] + self.primary_variables = non_interactions + expanded_primary = _expand_higher_variables(self.primary_variables) + self._derived_start = len(expanded_primary) + + all_variables = expanded_primary.copy() + all_variables += _expanded_interactions(variables) all_variables += missing(all_variables) self._missing_field_indices = missing_field_indices(all_variables) @@ -140,10 +145,8 @@ def __setstate__(self, d): self.__dict__ = d -def typify_variables( - variable_definitions: list[VariableDefinition], -) -> list[FieldVariable]: - variables: list[FieldVariable] = [] +def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: + variables: list[Variable] = [] for definition in variable_definitions: try: variable_type = definition["type"] @@ -162,9 +165,6 @@ def typify_variables( "{'field' : 'Phone', type: 'String'}" ) - if variable_type == "Interaction": - continue - if variable_type == "FuzzyCategorical" and "other fields" not in definition: definition["other fields"] = [ # type: ignore d["field"] @@ -181,11 +181,11 @@ def typify_variables( ) variable_object = variable_class(definition) - assert isinstance(variable_object, FieldVariable) + assert isinstance(variable_object, Variable) variables.append(variable_object) - only_custom = all(isinstance(v, CustomType) for v in variables) + only_custom = all(isinstance(v, (CustomType, InteractionType)) for v in variables) if only_custom: raise ValueError( "At least one of the variable types needs to be a type" @@ -214,16 +214,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: return missing_variables -def interactions( - definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] -) -> list[InteractionType]: - var_d = {var.name: var for var in primary_variables} - +def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]: + field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)} interactions = [] - for definition in definitions: - if definition["type"] == "Interaction": - var = InteractionType(definition) - var.expandInteractions(var_d) + for var in variables: + if isinstance(var, InteractionType): + var.expandInteractions(field_vars) interactions.extend(var.higher_vars) return interactions From e169cc221153bc524b03432a475a6d55aea42234 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 15:41:26 -0800 Subject: [PATCH 17/20] ref: Move check for empty var def into typify_variables --- dedupe/datamodel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index c4962b87..053b8ee1 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -36,9 +36,6 @@ class DataModel(object): version = 1 def __init__(self, variable_definitions: Iterable[VariableDefinition]): - variable_definitions = list(variable_definitions) - if not variable_definitions: - raise ValueError("The variable definitions cannot be empty") variables = typify_variables(variable_definitions) non_interactions: list[FieldVariable] = [ v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] @@ -145,7 +142,13 @@ def __setstate__(self, d): self.__dict__ = d -def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: +def typify_variables( + variable_definitions: Iterable[VariableDefinition], +) -> list[Variable]: + variable_definitions = list(variable_definitions) + if not variable_definitions: + raise ValueError("The variable definitions cannot be empty") + variables: list[Variable] = [] for definition in variable_definitions: try: @@ -179,10 +182,8 @@ def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Var raise KeyError( f"Variable type {variable_type} not valid. Valid types include {valid}" ) - variable_object = variable_class(definition) assert isinstance(variable_object, Variable) - variables.append(variable_object) only_custom = all(isinstance(v, (CustomType, InteractionType)) for v in variables) From c181763010bb3526d0bfd6b767b7c367de368170 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 12:32:36 -0800 Subject: [PATCH 18/20] Move _load_settings() into separate function This will make the init method cleaner and allow us to more easily add versioning. _load_settings will read the settings file, and for future versions of the class, will parse and convert the loaded data structures to the canonical in-memory representation that the class expects. See following commits --- dedupe/api.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index d2733e00..39009be9 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -978,10 +978,25 @@ def __init__( """ super().__init__(num_cores, in_memory, **kwargs) + self.data_model, self.classifier, self.predicates = self._load_settings( + settings_file + ) + + logger.info("Predicate set:") + for predicate in self.predicates: + logger.info(predicate) + + self._fingerprinter = blocking.Fingerprinter(self.predicates) + + @staticmethod + def _load_settings( + settings_file: BinaryIO, + ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]: try: - self.data_model = pickle.load(settings_file) - self.classifier = pickle.load(settings_file) - self.predicates = pickle.load(settings_file) + data_model = pickle.load(settings_file) + classifier = pickle.load(settings_file) + predicates = pickle.load(settings_file) + return data_model, classifier, predicates except (KeyError, AttributeError): raise SettingsFileLoadingException( "This settings file is not compatible with " @@ -1007,12 +1022,6 @@ def __init__( "Try deleting the file" ) - logger.info("Predicate set:") - for predicate in self.predicates: - logger.info(predicate) - - self._fingerprinter = blocking.Fingerprinter(self.predicates) - class ActiveMatching(Matching): """ From 241c0a65ef555cc36a03ef13ae612aec7c3c999b Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 17:45:04 -0800 Subject: [PATCH 19/20] ref: Add versioning to _load_settings() --- dedupe/api.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 39009be9..b1059638 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -14,6 +14,7 @@ import sqlite3 import tempfile import warnings +from io import BytesIO from typing import TYPE_CHECKING, cast import numpy @@ -988,15 +989,23 @@ def __init__( self._fingerprinter = blocking.Fingerprinter(self.predicates) - @staticmethod + @classmethod def _load_settings( - settings_file: BinaryIO, + cls, settings_file: BinaryIO ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]: + # Make a copy so we can read it multiple times + settings_file = BytesIO(settings_file.read()) + settings_file.seek(0) try: - data_model = pickle.load(settings_file) - classifier = pickle.load(settings_file) - predicates = pickle.load(settings_file) - return data_model, classifier, predicates + version = pickle.load(settings_file) + if not isinstance(version, int): + settings_file.seek(0) + return cls._load_settings_v0(settings_file) + else: + raise SettingsFileLoadingException( + "Settings file version {} not understood".format(version) + ) + except (KeyError, AttributeError): raise SettingsFileLoadingException( "This settings file is not compatible with " @@ -1022,6 +1031,15 @@ def _load_settings( "Try deleting the file" ) + @staticmethod + def _load_settings_v0( + settings_file: BinaryIO, + ) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]: + data_model = pickle.load(settings_file) + classifier = pickle.load(settings_file) + predicates = pickle.load(settings_file) + return data_model, classifier, predicates + class ActiveMatching(Matching): """ From 83a2e8927124e39321e44e3404034f630396740f Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 17:48:21 -0800 Subject: [PATCH 20/20] ref: improve settings loading exception reporting Deduplicate the one exception we make. Also now instead of just eating the catchall exception, we raise from it so that you have slightly more insight into its cause --- dedupe/api.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index b1059638..83168367 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -996,6 +996,10 @@ def _load_settings( # Make a copy so we can read it multiple times settings_file = BytesIO(settings_file.read()) settings_file.seek(0) + catchall_exception = SettingsFileLoadingException( + "Something has gone wrong with loading the settings file. " + "Try deleting the file" + ) try: version = pickle.load(settings_file) if not isinstance(version, int): @@ -1021,15 +1025,9 @@ def _load_settings( "install that library: `pip install rlr`" ) else: - raise SettingsFileLoadingException( - "Something has gone wrong with loading the settings file. " - "Try deleting the file" - ) from exc - except: # noqa: E722 - raise SettingsFileLoadingException( - "Something has gone wrong with loading the settings file. " - "Try deleting the file" - ) + raise + except Exception as exc: + raise catchall_exception from exc @staticmethod def _load_settings_v0(