From f76de0d610ea68eb5f3f56f32e02347c514b2478 Mon Sep 17 00:00:00 2001 From: John Cadigan Date: Fri, 26 Jul 2019 23:49:02 -0700 Subject: [PATCH 1/2] Added module structure and entry points --- README.md | 20 +++++++++----- {data => bin}/get_evaluation.sh | 0 {src => facebook_muse}/__init__.py | 0 {src => facebook_muse}/dico_builder.py | 3 +-- {src => facebook_muse}/dictionary.py | 2 +- evaluate.py => facebook_muse/evaluate.py | 8 +++--- {src => facebook_muse}/evaluation/__init__.py | 0 .../evaluation/evaluator.py | 4 +-- .../evaluation/sent_translation.py | 4 +-- .../evaluation/word_translation.py | 5 ++-- {src => facebook_muse}/evaluation/wordsim.py | 2 +- {src => facebook_muse}/logger.py | 0 {src => facebook_muse}/models.py | 0 supervised.py => facebook_muse/supervised.py | 8 +++--- {src => facebook_muse}/trainer.py | 2 +- .../unsupervised.py | 8 +++--- {src => facebook_muse}/utils.py | 2 +- setup.py | 26 +++++++++++++++++++ 18 files changed, 64 insertions(+), 30 deletions(-) rename {data => bin}/get_evaluation.sh (100%) rename {src => facebook_muse}/__init__.py (100%) rename {src => facebook_muse}/dico_builder.py (99%) rename {src => facebook_muse}/dictionary.py (98%) rename evaluate.py => facebook_muse/evaluate.py (92%) rename {src => facebook_muse}/evaluation/__init__.py (100%) rename {src => facebook_muse}/evaluation/evaluator.py (99%) rename {src => facebook_muse}/evaluation/sent_translation.py (98%) rename {src => facebook_muse}/evaluation/word_translation.py (97%) rename {src => facebook_muse}/evaluation/wordsim.py (99%) rename {src => facebook_muse}/logger.py (100%) rename {src => facebook_muse}/models.py (100%) rename supervised.py => facebook_muse/supervised.py (96%) rename {src => facebook_muse}/trainer.py (99%) rename unsupervised.py => facebook_muse/unsupervised.py (97%) rename {src => facebook_muse}/utils.py (99%) create mode 100644 setup.py diff --git a/README.md b/README.md index bd31356..568ef06 100644 --- a/README.md +++ b/README.md @@ -14,6 
+14,13 @@ We include two methods, one *supervised* that uses a bilingual dictionary or ide MUSE is available on CPU or GPU, in Python 2 or 3. Faiss is *optional* for GPU users - though Faiss-GPU will greatly speed up nearest neighbor search - and *highly recommended* for CPU users. Faiss can be installed using "conda install faiss-cpu -c pytorch" or "conda install faiss-gpu -c pytorch". +## Installation +```bash +python -m pip install . +``` + +This adds bash scripts and entry point scripts to the path. + ## Get evaluation datasets To download monolingual and cross-lingual word embeddings evaluation datasets: * Our 110 [bilingual dictionaries](https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries) @@ -24,17 +31,18 @@ To download monolingual and cross-lingual word embeddings evaluation datasets: You can simply run: ```bash +mkdir data cd data/ wget https://dl.fbaipublicfiles.com/arrival/vectors.tar.gz wget https://dl.fbaipublicfiles.com/arrival/wordsim.tar.gz wget https://dl.fbaipublicfiles.com/arrival/dictionaries.tar.gz +cd .. ``` Alternatively, you can also download the data with: ```bash -cd data/ -./get_evaluation.sh +get_evaluation.sh ``` *Note: Requires bash 4. 
The download of Europarl is disabled by default (slow), you can enable it [here](https://github.com/facebookresearch/MUSE/blob/master/data/get_evaluation.sh#L99-L100).* @@ -60,14 +68,14 @@ For more details on these approaches, please check [here](https://arxiv.org/pdf/ ### The supervised way: iterative Procrustes (CPU|GPU) To learn a mapping between the source and the target space, simply run: ```bash -python supervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 --dico_train default +supervised --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 --dico_train default ``` By default, *dico_train* will point to our ground-truth dictionaries (downloaded above); when set to "identical_char" it will use identical character strings between source and target languages to form a vocabulary. Logs and embeddings will be saved in the dumped/ directory. ### The unsupervised way: adversarial training and refinement (CPU|GPU) To learn a mapping using adversarial training and iterative Procrustes refinement, run: ```bash -python unsupervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 +unsupervised --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 ``` By default, the validation metric is the mean cosine of word pairs from a synthetic dictionary built with CSLS (Cross-domain similarity local scaling). For some language pairs (e.g. En-Zh), we recommend to center the embeddings using `--normalize_embeddings center`. 
@@ -77,12 +85,12 @@ We also include a simple script to evaluate the quality of monolingual or cross- **Monolingual** ```bash -python evaluate.py --src_lang en --src_emb data/wiki.en.vec --max_vocab 200000 +evaluate --src_lang en --src_emb data/wiki.en.vec --max_vocab 200000 ``` **Cross-lingual** ```bash -python evaluate.py --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec --max_vocab 200000 +evaluate --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec --max_vocab 200000 ``` ## Word embedding format diff --git a/data/get_evaluation.sh b/bin/get_evaluation.sh similarity index 100% rename from data/get_evaluation.sh rename to bin/get_evaluation.sh diff --git a/src/__init__.py b/facebook_muse/__init__.py similarity index 100% rename from src/__init__.py rename to facebook_muse/__init__.py diff --git a/src/dico_builder.py b/facebook_muse/dico_builder.py similarity index 99% rename from src/dico_builder.py rename to facebook_muse/dico_builder.py index 769ab9b..dc4d758 100644 --- a/src/dico_builder.py +++ b/facebook_muse/dico_builder.py @@ -10,8 +10,7 @@ from .utils import get_nn_avg_dist - -logger = getLogger() +logger = getLogger(__name__) def get_candidates(emb1, emb2, params): diff --git a/src/dictionary.py b/facebook_muse/dictionary.py similarity index 98% rename from src/dictionary.py rename to facebook_muse/dictionary.py index 0dd9033..03236f5 100644 --- a/src/dictionary.py +++ b/facebook_muse/dictionary.py @@ -8,7 +8,7 @@ from logging import getLogger -logger = getLogger() +logger = getLogger(__name__) class Dictionary(object): diff --git a/evaluate.py b/facebook_muse/evaluate.py similarity index 92% rename from evaluate.py rename to facebook_muse/evaluate.py index e49da01..77073e5 100644 --- a/evaluate.py +++ b/facebook_muse/evaluate.py @@ -11,10 +11,10 @@ import argparse from collections import OrderedDict -from src.utils import bool_flag, initialize_exp -from src.models import 
build_model -from src.trainer import Trainer -from src.evaluation import Evaluator +from facebook_muse.utils import bool_flag, initialize_exp +from facebook_muse.models import build_model +from facebook_muse.trainer import Trainer +from facebook_muse.evaluation import Evaluator # main parser = argparse.ArgumentParser(description='Evaluation') diff --git a/src/evaluation/__init__.py b/facebook_muse/evaluation/__init__.py similarity index 100% rename from src/evaluation/__init__.py rename to facebook_muse/evaluation/__init__.py diff --git a/src/evaluation/evaluator.py b/facebook_muse/evaluation/evaluator.py similarity index 99% rename from src/evaluation/evaluator.py rename to facebook_muse/evaluation/evaluator.py index 53106b6..739c1dc 100644 --- a/src/evaluation/evaluator.py +++ b/facebook_muse/evaluation/evaluator.py @@ -15,10 +15,10 @@ from . import get_word_translation_accuracy from . import load_europarl_data, get_sent_translation_accuracy from ..dico_builder import get_candidates, build_dictionary -from src.utils import get_idf +from facebook_muse.utils import get_idf -logger = getLogger() +logger = getLogger(__name__) class Evaluator(object): diff --git a/src/evaluation/sent_translation.py b/facebook_muse/evaluation/sent_translation.py similarity index 98% rename from src/evaluation/sent_translation.py rename to facebook_muse/evaluation/sent_translation.py index 8809314..cbd2bf8 100644 --- a/src/evaluation/sent_translation.py +++ b/facebook_muse/evaluation/sent_translation.py @@ -11,13 +11,13 @@ import numpy as np import torch -from src.utils import bow_idf, get_nn_avg_dist +from facebook_muse.utils import bow_idf, get_nn_avg_dist EUROPARL_DIR = 'data/crosslingual/europarl' -logger = getLogger() +logger = getLogger(__name__) def load_europarl_data(lg1, lg2, n_max=1e10, lower=True): diff --git a/src/evaluation/word_translation.py b/facebook_muse/evaluation/word_translation.py similarity index 97% rename from src/evaluation/word_translation.py rename to 
facebook_muse/evaluation/word_translation.py index 346b756..8af7acb 100644 --- a/src/evaluation/word_translation.py +++ b/facebook_muse/evaluation/word_translation.py @@ -14,10 +14,10 @@ from ..utils import get_nn_avg_dist -DIC_EVAL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'data', 'crosslingual', 'dictionaries') +DIC_EVAL_PATH = os.path.join('data', 'crosslingual', 'dictionaries') -logger = getLogger() +logger = getLogger(__name__) def load_identical_char_dico(word2id1, word2id2): @@ -46,6 +46,7 @@ def load_dictionary(path, word2id1, word2id2): Return a torch tensor of size (n, 2) where n is the size of the loader dictionary, and sort it by source word frequency. """ + logger.debug("Loading dictionary from %s", path) assert os.path.isfile(path) pairs = [] diff --git a/src/evaluation/wordsim.py b/facebook_muse/evaluation/wordsim.py similarity index 99% rename from src/evaluation/wordsim.py rename to facebook_muse/evaluation/wordsim.py index 6a7b22e..4b970a1 100644 --- a/src/evaluation/wordsim.py +++ b/facebook_muse/evaluation/wordsim.py @@ -17,7 +17,7 @@ SEMEVAL17_EVAL_PATH = 'data/crosslingual/wordsim' -logger = getLogger() +logger = getLogger(__name__) def get_word_pairs(path, lower=True): diff --git a/src/logger.py b/facebook_muse/logger.py similarity index 100% rename from src/logger.py rename to facebook_muse/logger.py diff --git a/src/models.py b/facebook_muse/models.py similarity index 100% rename from src/models.py rename to facebook_muse/models.py diff --git a/supervised.py b/facebook_muse/supervised.py similarity index 96% rename from supervised.py rename to facebook_muse/supervised.py index 3675082..e0ed6c8 100644 --- a/supervised.py +++ b/facebook_muse/supervised.py @@ -11,10 +11,10 @@ from collections import OrderedDict import torch -from src.utils import bool_flag, initialize_exp -from src.models import build_model -from src.trainer import Trainer -from src.evaluation import Evaluator +from facebook_muse.utils import bool_flag, initialize_exp +from 
facebook_muse.models import build_model +from facebook_muse.trainer import Trainer +from facebook_muse.evaluation import Evaluator VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10' diff --git a/src/trainer.py b/facebook_muse/trainer.py similarity index 99% rename from src/trainer.py rename to facebook_muse/trainer.py index dfe2446..7143988 100644 --- a/src/trainer.py +++ b/facebook_muse/trainer.py @@ -19,7 +19,7 @@ from .evaluation.word_translation import DIC_EVAL_PATH, load_identical_char_dico, load_dictionary -logger = getLogger() +logger = getLogger(__name__) class Trainer(object): diff --git a/unsupervised.py b/facebook_muse/unsupervised.py similarity index 97% rename from unsupervised.py rename to facebook_muse/unsupervised.py index 1c8d9cd..dda7fa4 100644 --- a/unsupervised.py +++ b/facebook_muse/unsupervised.py @@ -13,10 +13,10 @@ import numpy as np import torch -from src.utils import bool_flag, initialize_exp -from src.models import build_model -from src.trainer import Trainer -from src.evaluation import Evaluator +from facebook_muse.utils import bool_flag, initialize_exp +from facebook_muse.models import build_model +from facebook_muse.trainer import Trainer +from facebook_muse.evaluation import Evaluator VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000' diff --git a/src/utils.py b/facebook_muse/utils.py similarity index 99% rename from src/utils.py rename to facebook_muse/utils.py index 349c472..66e4e56 100644 --- a/src/utils.py +++ b/facebook_muse/utils.py @@ -25,7 +25,7 @@ MAIN_DUMP_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'dumped') -logger = getLogger() +logger = getLogger(__name__) # load Faiss if available (dramatically accelerates the nearest neighbor search) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c249349 --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +from setuptools import setup, find_packages +import glob + +def get_bash_scripts(): + return glob.glob("bin/*.sh") + 
+setup( + name="facebook_muse", + packages=find_packages(), + python_requires=">=2.7", + install_requires=["faiss", + "numpy", + "scipy", + "torch"], + extras_require={ + "dev" : ["jupyter"] + }, + scripts=get_bash_scripts(), + entry_points={ + 'console_scripts' : [ + "supervised=facebook_muse.supervised:main", + "unsupervised=facebook_muse.unsupervised:main", + "evaluate=facebook_muse.evaluate:main" + ] + } +) From 8a1132c03aca8eff0d808670212b683c575c7601 Mon Sep 17 00:00:00 2001 From: John Cadigan Date: Mon, 12 Aug 2019 20:46:02 -0700 Subject: [PATCH 2/2] Adding main func --- facebook_muse/evaluate.py | 88 ++++----- facebook_muse/supervised.py | 183 +++++++++---------- facebook_muse/unsupervised.py | 328 +++++++++++++++++----------------- 3 files changed, 305 insertions(+), 294 deletions(-) diff --git a/facebook_muse/evaluate.py b/facebook_muse/evaluate.py index 77073e5..804b300 100644 --- a/facebook_muse/evaluate.py +++ b/facebook_muse/evaluate.py @@ -17,45 +17,49 @@ from facebook_muse.evaluation import Evaluator # main -parser = argparse.ArgumentParser(description='Evaluation') -parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") -parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") -parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") -parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") -parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") -# data -parser.add_argument("--src_lang", type=str, default="", help="Source language") -parser.add_argument("--tgt_lang", type=str, default="", help="Target language") -parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") -# reload pre-trained embeddings -parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings") -parser.add_argument("--tgt_emb", type=str, 
default="", help="Reload target embeddings") -parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") -parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") -parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") - - -# parse parameters -params = parser.parse_args() - -# check parameters -assert params.src_lang, "source language undefined" -assert os.path.isfile(params.src_emb) -assert not params.tgt_lang or os.path.isfile(params.tgt_emb) -assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) - -# build logger / model / trainer / evaluator -logger = initialize_exp(params) -src_emb, tgt_emb, mapping, _ = build_model(params, False) -trainer = Trainer(src_emb, tgt_emb, mapping, None, params) -evaluator = Evaluator(trainer) - -# run evaluations -to_log = OrderedDict({'n_iter': 0}) -evaluator.monolingual_wordsim(to_log) -# evaluator.monolingual_wordanalogy(to_log) -if params.tgt_lang: - evaluator.crosslingual_wordsim(to_log) - evaluator.word_translation(to_log) - evaluator.sent_translation(to_log) - # evaluator.dist_mean_cosine(to_log) +def main(): + parser = argparse.ArgumentParser(description='Evaluation') + parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") + parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") + parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") + parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") + parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") + # data + parser.add_argument("--src_lang", type=str, default="", help="Source language") + parser.add_argument("--tgt_lang", type=str, default="", help="Target language") + parser.add_argument("--dico_eval", type=str, default="default", help="Path to 
evaluation dictionary") + # reload pre-trained embeddings + parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings") + parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings") + parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") + parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") + parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") + + + # parse parameters + params = parser.parse_args() + + # check parameters + assert params.src_lang, "source language undefined" + assert os.path.isfile(params.src_emb) + assert not params.tgt_lang or os.path.isfile(params.tgt_emb) + assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) + + # build logger / model / trainer / evaluator + logger = initialize_exp(params) + src_emb, tgt_emb, mapping, _ = build_model(params, False) + trainer = Trainer(src_emb, tgt_emb, mapping, None, params) + evaluator = Evaluator(trainer) + + # run evaluations + to_log = OrderedDict({'n_iter': 0}) + evaluator.monolingual_wordsim(to_log) + # evaluator.monolingual_wordanalogy(to_log) + if params.tgt_lang: + evaluator.crosslingual_wordsim(to_log) + evaluator.word_translation(to_log) + evaluator.sent_translation(to_log) + # evaluator.dist_mean_cosine(to_log) + +if __name__ == "__main__": + main() diff --git a/facebook_muse/supervised.py b/facebook_muse/supervised.py index e0ed6c8..ac33558 100644 --- a/facebook_muse/supervised.py +++ b/facebook_muse/supervised.py @@ -20,93 +20,96 @@ VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10' VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000' - -# main -parser = argparse.ArgumentParser(description='Supervised training') -parser.add_argument("--seed", type=int, default=-1, help="Initialization seed") -parser.add_argument("--verbose", type=int, default=2, help="Verbose level 
(2:debug, 1:info, 0:warning)") -parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") -parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") -parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") -parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") -parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)") - -# data -parser.add_argument("--src_lang", type=str, default='en', help="Source language") -parser.add_argument("--tgt_lang", type=str, default='es', help="Target language") -parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") -parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") -# training refinement -parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)") -# dictionary creation parameters (for refinement) -parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)") -parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") -parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)") -parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S") -parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation") -parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)") -parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)") -parser.add_argument("--dico_max_size", type=int, default=0, 
help="Maximum generated dictionary size (0 to disable)") -# reload pre-trained embeddings -parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings") -parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings") -parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") - - -# parse parameters -params = parser.parse_args() - -# check parameters -assert not params.cuda or torch.cuda.is_available() -assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train) -assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"] -assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank -assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size -assert os.path.isfile(params.src_emb) -assert os.path.isfile(params.tgt_emb) -assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) -assert params.export in ["", "txt", "pth"] - -# build logger / model / trainer / evaluator -logger = initialize_exp(params) -src_emb, tgt_emb, mapping, _ = build_model(params, False) -trainer = Trainer(src_emb, tgt_emb, mapping, None, params) -evaluator = Evaluator(trainer) - -# load a training dictionary. if a dictionary path is not provided, use a default -# one ("default") or create one based on identical character strings ("identical_char") -trainer.load_training_dico(params.dico_train) - -# define the validation metric -VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP -logger.info("Validation metric: %s" % VALIDATION_METRIC) - -""" -Learning loop for Procrustes Iterative Learning -""" -for n_iter in range(params.n_refinement + 1): - - logger.info('Starting iteration %i...' 
% n_iter) - - # build a dictionary from aligned embeddings (unless - # it is the first iteration and we use the init one) - if n_iter > 0 or not hasattr(trainer, 'dico'): - trainer.build_dictionary() - - # apply the Procrustes solution - trainer.procrustes() - - # embeddings evaluation - to_log = OrderedDict({'n_iter': n_iter}) - evaluator.all_eval(to_log) - - # JSON log / save best model / end of epoch - logger.info("__log__:%s" % json.dumps(to_log)) - trainer.save_best(to_log, VALIDATION_METRIC) - logger.info('End of iteration %i.\n\n' % n_iter) - - -# export embeddings -if params.export: - trainer.reload_best() - trainer.export() +def main(): + # main + parser = argparse.ArgumentParser(description='Supervised training') + parser.add_argument("--seed", type=int, default=-1, help="Initialization seed") + parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") + parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") + parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") + parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") + parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") + parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)") + + # data + parser.add_argument("--src_lang", type=str, default='en', help="Source language") + parser.add_argument("--tgt_lang", type=str, default='es', help="Target language") + parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") + parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") + # training refinement + parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)") + # dictionary creation parameters (for refinement) + 
parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)") + parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") + parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)") + parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S") + parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation") + parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)") + parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)") + parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)") + # reload pre-trained embeddings + parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings") + parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings") + parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") + + + # parse parameters + params = parser.parse_args() + + # check parameters + assert not params.cuda or torch.cuda.is_available() + assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train) + assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"] + assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank + assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size + assert os.path.isfile(params.src_emb) + assert os.path.isfile(params.tgt_emb) + assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) + assert params.export in ["", "txt", "pth"] + + # build logger / model / trainer 
/ evaluator + logger = initialize_exp(params) + src_emb, tgt_emb, mapping, _ = build_model(params, False) + trainer = Trainer(src_emb, tgt_emb, mapping, None, params) + evaluator = Evaluator(trainer) + + # load a training dictionary. if a dictionary path is not provided, use a default + # one ("default") or create one based on identical character strings ("identical_char") + trainer.load_training_dico(params.dico_train) + + # define the validation metric + VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP + logger.info("Validation metric: %s" % VALIDATION_METRIC) + + """ + Learning loop for Procrustes Iterative Learning + """ + for n_iter in range(params.n_refinement + 1): + + logger.info('Starting iteration %i...' % n_iter) + + # build a dictionary from aligned embeddings (unless + # it is the first iteration and we use the init one) + if n_iter > 0 or not hasattr(trainer, 'dico'): + trainer.build_dictionary() + + # apply the Procrustes solution + trainer.procrustes() + + # embeddings evaluation + to_log = OrderedDict({'n_iter': n_iter}) + evaluator.all_eval(to_log) + + # JSON log / save best model / end of epoch + logger.info("__log__:%s" % json.dumps(to_log)) + trainer.save_best(to_log, VALIDATION_METRIC) + logger.info('End of iteration %i.\n\n' % n_iter) + + + # export embeddings + if params.export: + trainer.reload_best() + trainer.export() + +if __name__ == "__main__": + main() diff --git a/facebook_muse/unsupervised.py b/facebook_muse/unsupervised.py index dda7fa4..c5b7a4a 100644 --- a/facebook_muse/unsupervised.py +++ b/facebook_muse/unsupervised.py @@ -22,165 +22,169 @@ VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000' -# main -parser = argparse.ArgumentParser(description='Unsupervised training') -parser.add_argument("--seed", type=int, default=-1, help="Initialization seed") -parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") 
-parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") -parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") -parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") -parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") -parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)") -# data -parser.add_argument("--src_lang", type=str, default='en', help="Source language") -parser.add_argument("--tgt_lang", type=str, default='es', help="Target language") -parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") -parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") -# mapping -parser.add_argument("--map_id_init", type=bool_flag, default=True, help="Initialize the mapping as an identity matrix") -parser.add_argument("--map_beta", type=float, default=0.001, help="Beta for orthogonalization") -# discriminator -parser.add_argument("--dis_layers", type=int, default=2, help="Discriminator layers") -parser.add_argument("--dis_hid_dim", type=int, default=2048, help="Discriminator hidden layer dimensions") -parser.add_argument("--dis_dropout", type=float, default=0., help="Discriminator dropout") -parser.add_argument("--dis_input_dropout", type=float, default=0.1, help="Discriminator input dropout") -parser.add_argument("--dis_steps", type=int, default=5, help="Discriminator steps") -parser.add_argument("--dis_lambda", type=float, default=1, help="Discriminator loss feedback coefficient") -parser.add_argument("--dis_most_frequent", type=int, default=75000, help="Select embeddings of the k most frequent words for discrimination (0 to disable)") -parser.add_argument("--dis_smooth", type=float, default=0.1, help="Discriminator smooth predictions") -parser.add_argument("--dis_clip_weights", type=float, default=0, help="Clip 
discriminator weights (0 to disable)") -# training adversarial -parser.add_argument("--adversarial", type=bool_flag, default=True, help="Use adversarial training") -parser.add_argument("--n_epochs", type=int, default=5, help="Number of epochs") -parser.add_argument("--epoch_size", type=int, default=1000000, help="Iterations per epoch") -parser.add_argument("--batch_size", type=int, default=32, help="Batch size") -parser.add_argument("--map_optimizer", type=str, default="sgd,lr=0.1", help="Mapping optimizer") -parser.add_argument("--dis_optimizer", type=str, default="sgd,lr=0.1", help="Discriminator optimizer") -parser.add_argument("--lr_decay", type=float, default=0.98, help="Learning rate decay (SGD only)") -parser.add_argument("--min_lr", type=float, default=1e-6, help="Minimum learning rate (SGD only)") -parser.add_argument("--lr_shrink", type=float, default=0.5, help="Shrink the learning rate if the validation metric decreases (1 to disable)") -# training refinement -parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)") -# dictionary creation parameters (for refinement) -parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") -parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)") -parser.add_argument("--dico_build", type=str, default='S2T', help="S2T,T2S,S2T|T2S,S2T&T2S") -parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation") -parser.add_argument("--dico_max_rank", type=int, default=15000, help="Maximum dictionary words rank (0 to disable)") -parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)") -parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)") -# 
reload pre-trained embeddings -parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings") -parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings") -parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") - - -# parse parameters -params = parser.parse_args() - -# check parameters -assert not params.cuda or torch.cuda.is_available() -assert 0 <= params.dis_dropout < 1 -assert 0 <= params.dis_input_dropout < 1 -assert 0 <= params.dis_smooth < 0.5 -assert params.dis_lambda > 0 and params.dis_steps > 0 -assert 0 < params.lr_shrink <= 1 -assert os.path.isfile(params.src_emb) -assert os.path.isfile(params.tgt_emb) -assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) -assert params.export in ["", "txt", "pth"] - -# build model / trainer / evaluator -logger = initialize_exp(params) -src_emb, tgt_emb, mapping, discriminator = build_model(params, True) -trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params) -evaluator = Evaluator(trainer) - - -""" -Learning loop for Adversarial Training -""" -if params.adversarial: - logger.info('----> ADVERSARIAL TRAINING <----\n\n') - - # training loop - for n_epoch in range(params.n_epochs): - - logger.info('Starting adversarial training epoch %i...' 
% n_epoch) - tic = time.time() - n_words_proc = 0 - stats = {'DIS_COSTS': []} - - for n_iter in range(0, params.epoch_size, params.batch_size): - - # discriminator training - for _ in range(params.dis_steps): - trainer.dis_step(stats) - - # mapping training (discriminator fooling) - n_words_proc += trainer.mapping_step(stats) - - # log stats - if n_iter % 500 == 0: - stats_str = [('DIS_COSTS', 'Discriminator loss')] - stats_log = ['%s: %.4f' % (v, np.mean(stats[k])) - for k, v in stats_str if len(stats[k]) > 0] - stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic))) - logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log)) - - # reset - tic = time.time() - n_words_proc = 0 - for k, _ in stats_str: - del stats[k][:] - - # embeddings / discriminator evaluation - to_log = OrderedDict({'n_epoch': n_epoch}) - evaluator.all_eval(to_log) - evaluator.eval_dis(to_log) - - # JSON log / save best model / end of epoch - logger.info("__log__:%s" % json.dumps(to_log)) - trainer.save_best(to_log, VALIDATION_METRIC) - logger.info('End of epoch %i.\n\n' % n_epoch) - - # update the learning rate (stop if too small) - trainer.update_lr(to_log, VALIDATION_METRIC) - if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr: - logger.info('Learning rate < 1e-6. BREAK.') - break - - -""" -Learning loop for Procrustes Iterative Refinement -""" -if params.n_refinement > 0: - # Get the best mapping according to VALIDATION_METRIC - logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n') - trainer.reload_best() - - # training loop - for n_iter in range(params.n_refinement): - - logger.info('Starting refinement iteration %i...' 
def _build_parser():
    """Return the argument parser describing every unsupervised-training option."""
    parser = argparse.ArgumentParser(description='Unsupervised training')
    parser.add_argument("--seed", type=int, default=-1, help="Initialization seed")
    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
    parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")
    # data
    parser.add_argument("--src_lang", type=str, default='en', help="Source language")
    parser.add_argument("--tgt_lang", type=str, default='es', help="Target language")
    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
    # mapping
    parser.add_argument("--map_id_init", type=bool_flag, default=True, help="Initialize the mapping as an identity matrix")
    parser.add_argument("--map_beta", type=float, default=0.001, help="Beta for orthogonalization")
    # discriminator
    parser.add_argument("--dis_layers", type=int, default=2, help="Discriminator layers")
    parser.add_argument("--dis_hid_dim", type=int, default=2048, help="Discriminator hidden layer dimensions")
    parser.add_argument("--dis_dropout", type=float, default=0., help="Discriminator dropout")
    parser.add_argument("--dis_input_dropout", type=float, default=0.1, help="Discriminator input dropout")
    parser.add_argument("--dis_steps", type=int, default=5, help="Discriminator steps")
    parser.add_argument("--dis_lambda", type=float, default=1, help="Discriminator loss feedback coefficient")
    parser.add_argument("--dis_most_frequent", type=int, default=75000, help="Select embeddings of the k most frequent words for discrimination (0 to disable)")
    parser.add_argument("--dis_smooth", type=float, default=0.1, help="Discriminator smooth predictions")
    parser.add_argument("--dis_clip_weights", type=float, default=0, help="Clip discriminator weights (0 to disable)")
    # training adversarial
    parser.add_argument("--adversarial", type=bool_flag, default=True, help="Use adversarial training")
    parser.add_argument("--n_epochs", type=int, default=5, help="Number of epochs")
    parser.add_argument("--epoch_size", type=int, default=1000000, help="Iterations per epoch")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--map_optimizer", type=str, default="sgd,lr=0.1", help="Mapping optimizer")
    parser.add_argument("--dis_optimizer", type=str, default="sgd,lr=0.1", help="Discriminator optimizer")
    parser.add_argument("--lr_decay", type=float, default=0.98, help="Learning rate decay (SGD only)")
    parser.add_argument("--min_lr", type=float, default=1e-6, help="Minimum learning rate (SGD only)")
    parser.add_argument("--lr_shrink", type=float, default=0.5, help="Shrink the learning rate if the validation metric decreases (1 to disable)")
    # training refinement
    parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)")
    # dictionary creation parameters (for refinement)
    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
    parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)")
    parser.add_argument("--dico_build", type=str, default='S2T', help="S2T,T2S,S2T|T2S,S2T&T2S")
    parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation")
    parser.add_argument("--dico_max_rank", type=int, default=15000, help="Maximum dictionary words rank (0 to disable)")
    parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)")
    parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)")
    # reload pre-trained embeddings
    parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings")
    parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings")
    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
    return parser


def _check_params(parser, params):
    """Abort through ``parser.error`` on the first invalid option value.

    The conditions are the ones the script has always enforced; they are
    reported via argparse instead of ``assert`` so that they still run under
    ``python -O`` and tell the user which option is wrong.
    """
    checks = (
        (not params.cuda or torch.cuda.is_available(),
         "--cuda was requested but torch.cuda.is_available() is False"),
        (0 <= params.dis_dropout < 1, "--dis_dropout must be in [0, 1)"),
        (0 <= params.dis_input_dropout < 1, "--dis_input_dropout must be in [0, 1)"),
        (0 <= params.dis_smooth < 0.5, "--dis_smooth must be in [0, 0.5)"),
        (params.dis_lambda > 0 and params.dis_steps > 0,
         "--dis_lambda and --dis_steps must both be > 0"),
        (0 < params.lr_shrink <= 1, "--lr_shrink must be in (0, 1]"),
        (os.path.isfile(params.src_emb),
         "--src_emb is not an existing file: %r" % params.src_emb),
        (os.path.isfile(params.tgt_emb),
         "--tgt_emb is not an existing file: %r" % params.tgt_emb),
        (params.dico_eval == 'default' or os.path.isfile(params.dico_eval),
         "--dico_eval must be 'default' or an existing file: %r" % params.dico_eval),
        (params.export in ["", "txt", "pth"],
         "--export must be one of '', 'txt', 'pth'"),
    )
    for is_valid, message in checks:
        if not is_valid:
            parser.error(message)  # prints usage + message, exits with status 2


def _adversarial_training(params, trainer, evaluator, logger):
    """Run the adversarial learning loop for at most ``params.n_epochs`` epochs."""
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # (stats key, label used in the log line); constant for the whole run
    stats_str = [('DIS_COSTS', 'Discriminator loss')]

    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):

            # discriminator training
            for _ in range(params.dis_steps):
                trainer.dis_step(stats)

            # mapping training (discriminator fooling)
            n_words_proc += trainer.mapping_step(stats)

            # log stats
            if n_iter % 500 == 0:
                stats_log = ['%s: %.4f' % (label, np.mean(stats[key]))
                             for key, label in stats_str if len(stats[key]) > 0]
                stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
                logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                # reset the throughput window and the accumulated costs
                tic = time.time()
                n_words_proc = 0
                for key, _ in stats_str:
                    del stats[key][:]

        # embeddings / discriminator evaluation
        to_log = OrderedDict({'n_epoch': n_epoch})
        evaluator.all_eval(to_log)
        evaluator.eval_dis(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of epoch %i.\n\n' % n_epoch)

        # update the learning rate (stop if too small)
        trainer.update_lr(to_log, VALIDATION_METRIC)
        if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
            # report the configured threshold, not a hard-coded 1e-6
            logger.info('Learning rate < %g. BREAK.' % params.min_lr)
            break


def _procrustes_refinement(params, trainer, evaluator, logger):
    """Run ``params.n_refinement`` Procrustes iterations from the best saved mapping."""
    logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n')

    # Get the best mapping according to VALIDATION_METRIC
    trainer.reload_best()

    for n_iter in range(params.n_refinement):

        logger.info('Starting refinement iteration %i...' % n_iter)

        # build a dictionary from aligned embeddings
        trainer.build_dictionary()

        # apply the Procrustes solution
        trainer.procrustes()

        # embeddings evaluation
        to_log = OrderedDict({'n_iter': n_iter})
        evaluator.all_eval(to_log)

        # JSON log / save best model / end of epoch
        logger.info("__log__:%s" % json.dumps(to_log))
        trainer.save_best(to_log, VALIDATION_METRIC)
        logger.info('End of refinement iteration %i.\n\n' % n_iter)


def main(argv=None):
    """Entry point for unsupervised training.

    Parses and validates the command line, builds the model, trainer and
    evaluator, then runs adversarial training (if ``--adversarial``),
    Procrustes refinement (if ``--n_refinement`` > 0) and finally exports the
    best embeddings (if ``--export`` is non-empty).

    Args:
        argv: Optional list of command-line tokens. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, so calling ``main()`` with
            no arguments behaves as before.

    Raises:
        SystemExit: raised by argparse when parsing fails or when an option
            value is rejected by the parameter checks.
    """
    parser = _build_parser()

    # parse parameters
    params = parser.parse_args(argv)

    # check parameters
    _check_params(parser, params)

    # build model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
    trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
    evaluator = Evaluator(trainer)

    if params.adversarial:
        _adversarial_training(params, trainer, evaluator, logger)

    if params.n_refinement > 0:
        _procrustes_refinement(params, trainer, evaluator, logger)

    # export embeddings
    if params.export:
        trainer.reload_best()
        trainer.export()


if __name__ == "__main__":
    main()