facebookresearch · johncadigan · Jul 27, 2019 · Aug 13, 2019
diff --git a/README.md b/README.md
@@ -14,6 +14,13 @@ We include two methods, one *supervised* that uses a bilingual dictionary or ide
 
 MUSE is available on CPU or GPU, in Python 2 or 3. Faiss is *optional* for GPU users - though Faiss-GPU will greatly speed up nearest neighbor search - and *highly recommended* for CPU users. Faiss can be installed using "conda install faiss-cpu -c pytorch" or "conda install faiss-gpu -c pytorch".
 
+## Installation
+```bash
+python -m pip install .
+```
+
+This installs the `facebook_muse` package and puts `get_evaluation.sh` and the `supervised`, `unsupervised` and `evaluate` commands on your PATH.
+
 ## Get evaluation datasets
 To download monolingual and cross-lingual word embeddings evaluation datasets:
 * Our 110 [bilingual dictionaries](https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries)
@@ -24,17 +31,18 @@ To download monolingual and cross-lingual word embeddings evaluation datasets:
 You can simply run:
 
 ```bash
+mkdir data
 cd data/
 wget https://dl.fbaipublicfiles.com/arrival/vectors.tar.gz
 wget https://dl.fbaipublicfiles.com/arrival/wordsim.tar.gz
 wget https://dl.fbaipublicfiles.com/arrival/dictionaries.tar.gz
+cd ..
 ```
 
 Alternatively, you can also download the data with:
 
 ```bash
-cd data/
-./get_evaluation.sh
+get_evaluation.sh
 ```
 
 *Note: Requires bash 4. The download of Europarl is disabled by default (slow), you can enable it [here](https://github.com/facebookresearch/MUSE/blob/master/data/get_evaluation.sh#L99-L100).*
@@ -60,14 +68,14 @@ For more details on these approaches, please check [here](https://arxiv.org/pdf/
 ### The supervised way: iterative Procrustes (CPU|GPU)
 To learn a mapping between the source and the target space, simply run:
 ```bash
-python supervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 --dico_train default
+supervised --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 --dico_train default
 ```
 By default, *dico_train* will point to our ground-truth dictionaries (downloaded above); when set to "identical_char" it will use identical character strings between source and target languages to form a vocabulary. Logs and embeddings will be saved in the dumped/ directory.
 
 ### The unsupervised way: adversarial training and refinement (CPU|GPU)
 To learn a mapping using adversarial training and iterative Procrustes refinement, run:
 ```bash
-python unsupervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5
+unsupervised --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5
 ```
 By default, the validation metric is the mean cosine of word pairs from a synthetic dictionary built with CSLS (Cross-domain similarity local scaling). For some language pairs (e.g. En-Zh),
 we recommend to center the embeddings using `--normalize_embeddings center`.
@@ -77,12 +85,12 @@ We also include a simple script to evaluate the quality of monolingual or cross-
 
 **Monolingual**
 ```bash
-python evaluate.py --src_lang en --src_emb data/wiki.en.vec --max_vocab 200000
+evaluate --src_lang en --src_emb data/wiki.en.vec --max_vocab 200000
 ```
 
 **Cross-lingual**
 ```bash
-python evaluate.py --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec --max_vocab 200000
+evaluate --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec --max_vocab 200000
 ```
 
 ## Word embedding format

diff --git a/data/get_evaluation.sh → bin/get_evaluation.sh b/data/get_evaluation.sh → bin/get_evaluation.sh
diff --git a/evaluate.py b/evaluate.py
diff --git a/src/__init__.py → facebook_muse/__init__.py b/src/__init__.py → facebook_muse/__init__.py
diff --git a/src/dico_builder.py → facebook_muse/dico_builder.py b/src/dico_builder.py → facebook_muse/dico_builder.py
@@ -10,8 +10,7 @@
 
 from .utils import get_nn_avg_dist
 
-
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 def get_candidates(emb1, emb2, params):

diff --git a/src/dictionary.py → facebook_muse/dictionary.py b/src/dictionary.py → facebook_muse/dictionary.py
@@ -8,7 +8,7 @@
 from logging import getLogger
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 class Dictionary(object):

diff --git a/facebook_muse/evaluate.py b/facebook_muse/evaluate.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+# Usage: evaluate --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec
+
+import os
+import argparse
+from collections import OrderedDict
+
+from facebook_muse.utils import bool_flag, initialize_exp
+from facebook_muse.models import build_model
+from facebook_muse.trainer import Trainer
+from facebook_muse.evaluation import Evaluator
+
+def main():
+    """Run evaluator.monolingual_wordsim, plus the cross-lingual evaluations when --tgt_lang is given, on the embeddings named on the command line."""
+    parser = argparse.ArgumentParser(description='Evaluation')
+    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
+    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
+    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
+    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
+    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
+    # data
+    parser.add_argument("--src_lang", type=str, default="", help="Source language")
+    parser.add_argument("--tgt_lang", type=str, default="", help="Target language")
+    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
+    # reload pre-trained embeddings
+    parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings")
+    parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings")
+    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
+    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
+    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
+
+
+    # parse parameters
+    params = parser.parse_args()
+
+    # check parameters (each failure raises an AssertionError naming the offending value)
+    assert params.src_lang, "source language undefined"
+    assert os.path.isfile(params.src_emb), "source embeddings file not found: %r" % params.src_emb
+    assert not params.tgt_lang or os.path.isfile(params.tgt_emb), "target embeddings file not found: %r" % params.tgt_emb
+    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval), "evaluation dictionary not found: %r" % params.dico_eval
+
+    # build logger / model / trainer / evaluator
+    initialize_exp(params)  # the returned logger is not needed in this script
+    src_emb, tgt_emb, mapping, _ = build_model(params, False)
+    trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
+    evaluator = Evaluator(trainer)
+
+    # run evaluations
+    to_log = OrderedDict({'n_iter': 0})
+    evaluator.monolingual_wordsim(to_log)
+    # evaluator.monolingual_wordanalogy(to_log)
+    if params.tgt_lang:
+        # cross-lingual evaluations only make sense when a target language was given
+        evaluator.crosslingual_wordsim(to_log)
+        evaluator.word_translation(to_log)
+        evaluator.sent_translation(to_log)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/evaluation/__init__.py → facebook_muse/evaluation/__init__.py b/src/evaluation/__init__.py → facebook_muse/evaluation/__init__.py
diff --git a/src/evaluation/evaluator.py → facebook_muse/evaluation/evaluator.py b/src/evaluation/evaluator.py → facebook_muse/evaluation/evaluator.py
@@ -15,10 +15,10 @@
 from . import get_word_translation_accuracy
 from . import load_europarl_data, get_sent_translation_accuracy
 from ..dico_builder import get_candidates, build_dictionary
-from src.utils import get_idf
+from facebook_muse.utils import get_idf
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 class Evaluator(object):

diff --git a/src/evaluation/sent_translation.py → facebook_muse/evaluation/sent_translation.py b/src/evaluation/sent_translation.py → facebook_muse/evaluation/sent_translation.py
@@ -11,13 +11,13 @@
 import numpy as np
 import torch
 
-from src.utils import bow_idf, get_nn_avg_dist
+from facebook_muse.utils import bow_idf, get_nn_avg_dist
 
 
 EUROPARL_DIR = 'data/crosslingual/europarl'
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 def load_europarl_data(lg1, lg2, n_max=1e10, lower=True):

diff --git a/src/evaluation/word_translation.py → facebook_muse/evaluation/word_translation.py b/src/evaluation/word_translation.py → facebook_muse/evaluation/word_translation.py
@@ -14,10 +14,10 @@
 from ..utils import get_nn_avg_dist
 
 
-DIC_EVAL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'data', 'crosslingual', 'dictionaries')
+DIC_EVAL_PATH = os.path.join('data', 'crosslingual', 'dictionaries')
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 def load_identical_char_dico(word2id1, word2id2):
@@ -46,6 +46,7 @@ def load_dictionary(path, word2id1, word2id2):
     Return a torch tensor of size (n, 2) where n is the size of the
     loader dictionary, and sort it by source word frequency.
     """
+    print(path)
     assert os.path.isfile(path)
 
     pairs = []

diff --git a/src/evaluation/wordsim.py → facebook_muse/evaluation/wordsim.py b/src/evaluation/wordsim.py → facebook_muse/evaluation/wordsim.py
@@ -17,7 +17,7 @@
 SEMEVAL17_EVAL_PATH = 'data/crosslingual/wordsim'
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 def get_word_pairs(path, lower=True):

diff --git a/src/logger.py → facebook_muse/logger.py b/src/logger.py → facebook_muse/logger.py
diff --git a/src/models.py → facebook_muse/models.py b/src/models.py → facebook_muse/models.py
diff --git a/facebook_muse/supervised.py b/facebook_muse/supervised.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import os
+import json
+import argparse
+from collections import OrderedDict
+import torch
+
+from facebook_muse.utils import bool_flag, initialize_exp
+from facebook_muse.models import build_model
+from facebook_muse.trainer import Trainer
+from facebook_muse.evaluation import Evaluator
+
+
+VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10'
+VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000'
+
+def main():
+    """Learn a mapping between the source and target embeddings with iterative Procrustes (supervised entry point).
+
+    Options are read from the command line; an option failing the checks below raises AssertionError before any model is built.
+    """
+    parser = argparse.ArgumentParser(description='Supervised training')
+    parser.add_argument("--seed", type=int, default=-1, help="Initialization seed")
+    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
+    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
+    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
+    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
+    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
+    parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")
+
+    # data
+    parser.add_argument("--src_lang", type=str, default='en', help="Source language")
+    parser.add_argument("--tgt_lang", type=str, default='es', help="Target language")
+    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
+    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
+    # training refinement
+    parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)")
+    # dictionary creation parameters (for refinement)
+    parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: ground-truth dictionary; identical_char: identical character strings)")
+    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
+    parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)")
+    parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S")
+    parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation")
+    parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)")
+    parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)")
+    parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)")
+    # reload pre-trained embeddings
+    parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings")
+    parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings")
+    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
+
+
+    # parse parameters
+    params = parser.parse_args()
+
+    # check parameters
+    assert not params.cuda or torch.cuda.is_available(), "--cuda is set but CUDA is not available"
+    assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train), "training dictionary not found: %r" % params.dico_train
+    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"], "invalid --dico_build: %r" % params.dico_build
+    assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank, "--dico_max_size must be lower than --dico_max_rank"
+    assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size, "--dico_max_size must be greater than --dico_min_size"
+    assert os.path.isfile(params.src_emb), "source embeddings file not found: %r" % params.src_emb
+    assert os.path.isfile(params.tgt_emb), "target embeddings file not found: %r" % params.tgt_emb
+    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval), "evaluation dictionary not found: %r" % params.dico_eval
+    assert params.export in ["", "txt", "pth"], "invalid --export: %r" % params.export
+
+    # build logger / model / trainer / evaluator
+    logger = initialize_exp(params)
+    src_emb, tgt_emb, mapping, _ = build_model(params, False)
+    trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
+    evaluator = Evaluator(trainer)
+
+    # load a training dictionary. if a dictionary path is not provided, use a default
+    # one ("default") or create one based on identical character strings ("identical_char")
+    trainer.load_training_dico(params.dico_train)
+
+    # define the validation metric
+    VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP
+    logger.info("Validation metric: %s" % VALIDATION_METRIC)
+
+    # Learning loop for Procrustes Iterative Learning
+    for n_iter in range(params.n_refinement + 1):
+
+        logger.info('Starting iteration %i...' % n_iter)
+
+        # build a dictionary from aligned embeddings (unless
+        # it is the first iteration and we use the init one)
+        if n_iter > 0 or not hasattr(trainer, 'dico'):
+            trainer.build_dictionary()
+
+        # apply the Procrustes solution
+        trainer.procrustes()
+
+        # embeddings evaluation
+        to_log = OrderedDict({'n_iter': n_iter})
+        evaluator.all_eval(to_log)
+
+        # JSON log / save best model / end of epoch
+        logger.info("__log__:%s" % json.dumps(to_log))
+        trainer.save_best(to_log, VALIDATION_METRIC)
+        logger.info('End of iteration %i.\n\n' % n_iter)
+
+    # export embeddings
+    if params.export:
+        trainer.reload_best()
+        trainer.export()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/trainer.py → facebook_muse/trainer.py b/src/trainer.py → facebook_muse/trainer.py
@@ -19,7 +19,7 @@
 from .evaluation.word_translation import DIC_EVAL_PATH, load_identical_char_dico, load_dictionary
 
 
-logger = getLogger()
+logger = getLogger(__name__)
 
 
 class Trainer(object):