From 8bc7634ba6a1463f4b20583c5fa8cb46fda3e071 Mon Sep 17 00:00:00 2001 From: delihiros Date: Thu, 5 Jan 2017 18:03:52 +0900 Subject: [PATCH 1/2] add option unk_frequency to adjust vocabulary size. --- sample_data/tiny_config.ini | 3 ++- src/bin/train.cc | 8 ++++++-- src/include/nmtkit/character_vocabulary.h | 2 +- src/include/nmtkit/word_vocabulary.h | 3 ++- src/lib/character_vocabulary.cc | 12 ++++++++++-- src/lib/word_vocabulary.cc | 17 +++++++++++++---- 6 files changed, 34 insertions(+), 11 deletions(-) diff --git a/sample_data/tiny_config.ini b/sample_data/tiny_config.ini index e2b978d..6082a4c 100644 --- a/sample_data/tiny_config.ini +++ b/sample_data/tiny_config.ini @@ -17,7 +17,8 @@ test_target=sample_data/tiny.out [Model] source_vocabulary_type=word target_vocabulary_type=word -source_vocabulary_size=30 +unk_frequency=0 +source_vocabulary_size=33 target_vocabulary_size=33 encoder_type=bidirectional decoder_type=default diff --git a/src/bin/train.cc b/src/bin/train.cc index 06f8e61..2e7fcec 100644 --- a/src/bin/train.cc +++ b/src/bin/train.cc @@ -172,6 +172,7 @@ void initializeLogger( // Arguments: // corpus_filepath: Location of the corpus file to be analyzed. // vocab_type: Name of the vocabulary type. +// unk_frequency: Frequency of unknown word. // vocab_size: Number of entries in the vocabulary. // // Returns: @@ -183,11 +184,12 @@ void initializeLogger( nmtkit::Vocabulary * createVocabulary( const string & corpus_filepath, const string & vocab_type, + const unsigned unk_frequency, const unsigned vocab_size) { if (vocab_type == "character") { - return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size); + return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size); } else if (vocab_type == "word") { - return new nmtkit::WordVocabulary(corpus_filepath, vocab_size); + return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size); } NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type); } @@ -370,11 +372,13 @@ int main(int argc, char * argv[]) { ::createVocabulary( config.get("Corpus.train_source"), config.get("Model.source_vocabulary_type"), + config.get("Model.unk_frequency"), config.get("Model.source_vocabulary_size"))); boost::scoped_ptr trg_vocab( ::createVocabulary( config.get("Corpus.train_target"), config.get("Model.target_vocabulary_type"), + config.get("Model.unk_frequency"), config.get("Model.target_vocabulary_size"))); ::saveArchive(model_dir / "source.vocab", archive_format, src_vocab); ::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab); diff --git a/src/include/nmtkit/character_vocabulary.h b/src/include/nmtkit/character_vocabulary.h index b1554ea..bb9e128 100644 --- a/src/include/nmtkit/character_vocabulary.h +++ b/src/include/nmtkit/character_vocabulary.h @@ -32,7 +32,7 @@ class CharacterVocabulary : public Vocabulary { // Arguments: // corpus_filename: Location of the corpus file to be analyzed. // size: Size of the vocabulary. - CharacterVocabulary(const std::string & corpus_filename, unsigned size); + CharacterVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size); ~CharacterVocabulary() override {} diff --git a/src/include/nmtkit/word_vocabulary.h b/src/include/nmtkit/word_vocabulary.h index e4ca18e..98862d0 100644 --- a/src/include/nmtkit/word_vocabulary.h +++ b/src/include/nmtkit/word_vocabulary.h @@ -31,8 +31,9 @@ class WordVocabulary : public Vocabulary { // // Arguments: // corpus_filename: Location of the corpus file to be analyzed. + // unk_frequency: A parameter to which word will be treated as unk. // size: Size of the vocabulary. - WordVocabulary(const std::string & corpus_filename, unsigned size); + WordVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size); ~WordVocabulary() override {} diff --git a/src/lib/character_vocabulary.cc b/src/lib/character_vocabulary.cc index d91d5ce..1aac5e4 100644 --- a/src/lib/character_vocabulary.cc +++ b/src/lib/character_vocabulary.cc @@ -50,8 +50,10 @@ namespace nmtkit { CharacterVocabulary::CharacterVocabulary( const string & corpus_filename, + unsigned unk_frequency, unsigned size) { - NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); + NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); + NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); ifstream ifs(corpus_filename); NMTKIT_CHECK( ifs.is_open(), @@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary( freq_.emplace_back(num_letters); freq_.emplace_back(num_lines); freq_.emplace_back(num_lines); - for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { + + unsigned longest_size = size; + if (unk_frequency > 0) { + longest_size = entries.size(); + } + + for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { const auto & entry = entries[i - 3]; stoi_[entry.second] = i; itos_.emplace_back(entry.second); diff --git a/src/lib/word_vocabulary.cc b/src/lib/word_vocabulary.cc index e57230f..85a6333 100644 --- a/src/lib/word_vocabulary.cc +++ b/src/lib/word_vocabulary.cc @@ -13,8 +13,9 @@ using namespace std; namespace nmtkit { -WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { - NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); +WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) { + NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); + NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); ifstream ifs(corpus_filename); NMTKIT_CHECK( ifs.is_open(), @@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { // Selects most frequent words. vector> entries; for (const auto & entry : freq) { - entries.emplace_back(make_pair(entry.second, entry.first)); + entries.emplace_back(make_pair(entry.second, entry.first)); } Array::sort(&entries, greater>()); @@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { freq_.emplace_back(num_words); freq_.emplace_back(num_lines); freq_.emplace_back(num_lines); - for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { + + unsigned longest_size = size; + if (unk_frequency > 0) { + longest_size = entries.size(); + } + + for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { const auto & entry = entries[i - 3]; stoi_[entry.second] = i; itos_.emplace_back(entry.second); freq_.emplace_back(entry.first); freq_[0] -= entry.first; } + + } unsigned WordVocabulary::getID(const string & word) const { From 94b361f8592cdcd6ce17191f03f42f5f9f81d40e Mon Sep 17 00:00:00 2001 From: delihiros Date: Thu, 5 Jan 2017 18:03:52 +0900 Subject: [PATCH 2/2] add option unk_frequency to adjust vocabulary size. --- sample_data/sample_config.ini | 1 + sample_data/tiny_config.ini | 3 ++- src/bin/train.cc | 8 ++++++-- src/include/nmtkit/character_vocabulary.h | 2 +- src/include/nmtkit/word_vocabulary.h | 3 ++- src/lib/character_vocabulary.cc | 12 ++++++++++-- src/lib/word_vocabulary.cc | 17 +++++++++++++---- 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/sample_data/sample_config.ini b/sample_data/sample_config.ini index 8cbf865..40fc946 100644 --- a/sample_data/sample_config.ini +++ b/sample_data/sample_config.ini @@ -52,6 +52,7 @@ source_vocabulary_type=word target_vocabulary_type=word ; Vocabulary size in each side. +unk_frequency=0 source_vocabulary_size=4100 target_vocabulary_size=4900 diff --git a/sample_data/tiny_config.ini b/sample_data/tiny_config.ini index e2b978d..6082a4c 100644 --- a/sample_data/tiny_config.ini +++ b/sample_data/tiny_config.ini @@ -17,7 +17,8 @@ test_target=sample_data/tiny.out [Model] source_vocabulary_type=word target_vocabulary_type=word -source_vocabulary_size=30 +unk_frequency=0 +source_vocabulary_size=33 target_vocabulary_size=33 encoder_type=bidirectional decoder_type=default diff --git a/src/bin/train.cc b/src/bin/train.cc index 06f8e61..2e7fcec 100644 --- a/src/bin/train.cc +++ b/src/bin/train.cc @@ -172,6 +172,7 @@ void initializeLogger( // Arguments: // corpus_filepath: Location of the corpus file to be analyzed. // vocab_type: Name of the vocabulary type. +// unk_frequency: Frequency of unknown word. // vocab_size: Number of entries in the vocabulary. // // Returns: @@ -183,11 +184,12 @@ void initializeLogger( nmtkit::Vocabulary * createVocabulary( const string & corpus_filepath, const string & vocab_type, + const unsigned unk_frequency, const unsigned vocab_size) { if (vocab_type == "character") { - return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size); + return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size); } else if (vocab_type == "word") { - return new nmtkit::WordVocabulary(corpus_filepath, vocab_size); + return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size); } NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type); } @@ -370,11 +372,13 @@ int main(int argc, char * argv[]) { ::createVocabulary( config.get("Corpus.train_source"), config.get("Model.source_vocabulary_type"), + config.get("Model.unk_frequency"), config.get("Model.source_vocabulary_size"))); boost::scoped_ptr trg_vocab( ::createVocabulary( config.get("Corpus.train_target"), config.get("Model.target_vocabulary_type"), + config.get("Model.unk_frequency"), config.get("Model.target_vocabulary_size"))); ::saveArchive(model_dir / "source.vocab", archive_format, src_vocab); ::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab); diff --git a/src/include/nmtkit/character_vocabulary.h b/src/include/nmtkit/character_vocabulary.h index b1554ea..bb9e128 100644 --- a/src/include/nmtkit/character_vocabulary.h +++ b/src/include/nmtkit/character_vocabulary.h @@ -32,7 +32,7 @@ class CharacterVocabulary : public Vocabulary { // Arguments: // corpus_filename: Location of the corpus file to be analyzed. // size: Size of the vocabulary. - CharacterVocabulary(const std::string & corpus_filename, unsigned size); + CharacterVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size); ~CharacterVocabulary() override {} diff --git a/src/include/nmtkit/word_vocabulary.h b/src/include/nmtkit/word_vocabulary.h index e4ca18e..98862d0 100644 --- a/src/include/nmtkit/word_vocabulary.h +++ b/src/include/nmtkit/word_vocabulary.h @@ -31,8 +31,9 @@ class WordVocabulary : public Vocabulary { // // Arguments: // corpus_filename: Location of the corpus file to be analyzed. + // unk_frequency: A parameter to which word will be treated as unk. // size: Size of the vocabulary. - WordVocabulary(const std::string & corpus_filename, unsigned size); + WordVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size); ~WordVocabulary() override {} diff --git a/src/lib/character_vocabulary.cc b/src/lib/character_vocabulary.cc index d91d5ce..1aac5e4 100644 --- a/src/lib/character_vocabulary.cc +++ b/src/lib/character_vocabulary.cc @@ -50,8 +50,10 @@ namespace nmtkit { CharacterVocabulary::CharacterVocabulary( const string & corpus_filename, + unsigned unk_frequency, unsigned size) { - NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); + NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); + NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); ifstream ifs(corpus_filename); NMTKIT_CHECK( ifs.is_open(), @@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary( freq_.emplace_back(num_letters); freq_.emplace_back(num_lines); freq_.emplace_back(num_lines); - for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { + + unsigned longest_size = size; + if (unk_frequency > 0) { + longest_size = entries.size(); + } + + for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { const auto & entry = entries[i - 3]; stoi_[entry.second] = i; itos_.emplace_back(entry.second); diff --git a/src/lib/word_vocabulary.cc b/src/lib/word_vocabulary.cc index e57230f..85a6333 100644 --- a/src/lib/word_vocabulary.cc +++ b/src/lib/word_vocabulary.cc @@ -13,8 +13,9 @@ using namespace std; namespace nmtkit { -WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { - NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); +WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) { + NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); + NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); ifstream ifs(corpus_filename); NMTKIT_CHECK( ifs.is_open(), @@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { // Selects most frequent words. vector> entries; for (const auto & entry : freq) { - entries.emplace_back(make_pair(entry.second, entry.first)); + entries.emplace_back(make_pair(entry.second, entry.first)); } Array::sort(&entries, greater>()); @@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { freq_.emplace_back(num_words); freq_.emplace_back(num_lines); freq_.emplace_back(num_lines); - for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { + + unsigned longest_size = size; + if (unk_frequency > 0) { + longest_size = entries.size(); + } + + for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { const auto & entry = entries[i - 3]; stoi_[entry.second] = i; itos_.emplace_back(entry.second); freq_.emplace_back(entry.first); freq_[0] -= entry.first; } + + } unsigned WordVocabulary::getID(const string & word) const {