From 8bc7634ba6a1463f4b20583c5fa8cb46fda3e071 Mon Sep 17 00:00:00 2001
From: delihiros <delihiros@gmail.com>
Date: Thu, 5 Jan 2017 18:03:52 +0900
Subject: [PATCH 1/2] add option unk_frequency to adjust vocabulary size.

---
 sample_data/tiny_config.ini               |  3 ++-
 src/bin/train.cc                          |  8 ++++++--
 src/include/nmtkit/character_vocabulary.h |  2 +-
 src/include/nmtkit/word_vocabulary.h      |  3 ++-
 src/lib/character_vocabulary.cc           | 12 ++++++++++--
 src/lib/word_vocabulary.cc                | 17 +++++++++++++----
 6 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/sample_data/tiny_config.ini b/sample_data/tiny_config.ini
index e2b978d..6082a4c 100644
--- a/sample_data/tiny_config.ini
+++ b/sample_data/tiny_config.ini
@@ -17,7 +17,8 @@ test_target=sample_data/tiny.out
 [Model]
 source_vocabulary_type=word
 target_vocabulary_type=word
-source_vocabulary_size=30
+unk_frequency=0
+source_vocabulary_size=33
 target_vocabulary_size=33
 encoder_type=bidirectional
 decoder_type=default
diff --git a/src/bin/train.cc b/src/bin/train.cc
index 06f8e61..2e7fcec 100644
--- a/src/bin/train.cc
+++ b/src/bin/train.cc
@@ -172,6 +172,7 @@ void initializeLogger(
 // Arguments:
 //   corpus_filepath: Location of the corpus file to be analyzed.
 //   vocab_type: Name of the vocabulary type.
+//   unk_frequency: Entries occurring at most this many times are left out.
 //   vocab_size: Number of entries in the vocabulary.
 //
 // Returns:
@@ -183,11 +184,12 @@ void initializeLogger(
 nmtkit::Vocabulary * createVocabulary(
     const string & corpus_filepath,
     const string & vocab_type,
+    const unsigned unk_frequency,
     const unsigned vocab_size) {
   if (vocab_type == "character") {
-    return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size);
+    return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size);
   } else if (vocab_type == "word") {
-    return new nmtkit::WordVocabulary(corpus_filepath, vocab_size);
+    return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size);
   }
   NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type);
 }
@@ -370,11 +372,13 @@ int main(int argc, char * argv[]) {
         ::createVocabulary(
             config.get<string>("Corpus.train_source"),
             config.get<string>("Model.source_vocabulary_type"),
+            config.get<unsigned>("Model.unk_frequency"),
             config.get<unsigned>("Model.source_vocabulary_size")));
     boost::scoped_ptr<nmtkit::Vocabulary> trg_vocab(
         ::createVocabulary(
             config.get<string>("Corpus.train_target"),
             config.get<string>("Model.target_vocabulary_type"),
+            config.get<unsigned>("Model.unk_frequency"),
             config.get<unsigned>("Model.target_vocabulary_size")));
     ::saveArchive(model_dir / "source.vocab", archive_format, src_vocab);
     ::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab);
diff --git a/src/include/nmtkit/character_vocabulary.h b/src/include/nmtkit/character_vocabulary.h
index b1554ea..bb9e128 100644
--- a/src/include/nmtkit/character_vocabulary.h
+++ b/src/include/nmtkit/character_vocabulary.h
@@ -32,7 +32,7 @@ class CharacterVocabulary : public Vocabulary {
   // Arguments:
   //   corpus_filename: Location of the corpus file to be analyzed.
   //   size: Size of the vocabulary.
-  CharacterVocabulary(const std::string & corpus_filename, unsigned size);
+  CharacterVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);
 
   ~CharacterVocabulary() override {}
 
diff --git a/src/include/nmtkit/word_vocabulary.h b/src/include/nmtkit/word_vocabulary.h
index e4ca18e..98862d0 100644
--- a/src/include/nmtkit/word_vocabulary.h
+++ b/src/include/nmtkit/word_vocabulary.h
@@ -31,8 +31,9 @@ class WordVocabulary : public Vocabulary {
   //
   // Arguments:
   //   corpus_filename: Location of the corpus file to be analyzed.
+  //   unk_frequency: Words seen at most this many times are treated as unk.
   //   size: Size of the vocabulary.
-  WordVocabulary(const std::string & corpus_filename, unsigned size);
+  WordVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);
 
   ~WordVocabulary() override {}
 
diff --git a/src/lib/character_vocabulary.cc b/src/lib/character_vocabulary.cc
index d91d5ce..1aac5e4 100644
--- a/src/lib/character_vocabulary.cc
+++ b/src/lib/character_vocabulary.cc
@@ -50,8 +50,10 @@ namespace nmtkit {
 
 CharacterVocabulary::CharacterVocabulary(
     const string & corpus_filename,
+    unsigned unk_frequency,
     unsigned size) {
-  NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
+  NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
+  NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
   ifstream ifs(corpus_filename);
   NMTKIT_CHECK(
       ifs.is_open(),
@@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary(
   freq_.emplace_back(num_letters);
   freq_.emplace_back(num_lines);
   freq_.emplace_back(num_lines);
-  for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {
+
+  // With a frequency threshold, `size` is ignored. The loop below starts at
+  // i = 3, so the bound needs +3 to be able to reach the final three entries.
+  const unsigned longest_size =
+      unk_frequency > 0 ? static_cast<unsigned>(entries.size() + 3) : size;
+
+  for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
     const auto & entry = entries[i - 3];
     stoi_[entry.second] = i;
     itos_.emplace_back(entry.second);
diff --git a/src/lib/word_vocabulary.cc b/src/lib/word_vocabulary.cc
index e57230f..85a6333 100644
--- a/src/lib/word_vocabulary.cc
+++ b/src/lib/word_vocabulary.cc
@@ -13,8 +13,9 @@ using namespace std;
 
 namespace nmtkit {
 
-WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
-  NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
+WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) {
+  NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
+  NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
   ifstream ifs(corpus_filename);
   NMTKIT_CHECK(
       ifs.is_open(),
@@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
   // Selects most frequent words.
   vector<pair<unsigned, string>> entries;
   for (const auto & entry : freq) {
-    entries.emplace_back(make_pair(entry.second, entry.first));
+      entries.emplace_back(make_pair(entry.second, entry.first));
   }
   Array::sort(&entries, greater<pair<unsigned, string>>());
 
@@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
   freq_.emplace_back(num_words);
   freq_.emplace_back(num_lines);
   freq_.emplace_back(num_lines);
-  for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {
+
+  // With a frequency threshold, `size` is ignored. The loop below starts at
+  // i = 3, so the bound needs +3 to be able to reach the final three entries.
+  const unsigned longest_size =
+      unk_frequency > 0 ? static_cast<unsigned>(entries.size() + 3) : size;
+
+  for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
     const auto & entry = entries[i - 3];
     stoi_[entry.second] = i;
     itos_.emplace_back(entry.second);
     freq_.emplace_back(entry.first);
     freq_[0] -= entry.first;
   }
+
+
 }
 
 unsigned WordVocabulary::getID(const string & word) const {

From 94b361f8592cdcd6ce17191f03f42f5f9f81d40e Mon Sep 17 00:00:00 2001
From: delihiros <delihiros@gmail.com>
Date: Thu, 5 Jan 2017 18:03:52 +0900
Subject: [PATCH 2/2] add option unk_frequency to adjust vocabulary size.

---
 sample_data/sample_config.ini             |  1 +
 sample_data/tiny_config.ini               |  3 ++-
 src/bin/train.cc                          |  8 ++++++--
 src/include/nmtkit/character_vocabulary.h |  2 +-
 src/include/nmtkit/word_vocabulary.h      |  3 ++-
 src/lib/character_vocabulary.cc           | 12 ++++++++++--
 src/lib/word_vocabulary.cc                | 17 +++++++++++++----
 7 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/sample_data/sample_config.ini b/sample_data/sample_config.ini
index 8cbf865..40fc946 100644
--- a/sample_data/sample_config.ini
+++ b/sample_data/sample_config.ini
@@ -52,6 +52,7 @@ source_vocabulary_type=word
 target_vocabulary_type=word
 
 ; Vocabulary size in each side.
+unk_frequency=0
 source_vocabulary_size=4100
 target_vocabulary_size=4900
 
diff --git a/sample_data/tiny_config.ini b/sample_data/tiny_config.ini
index e2b978d..6082a4c 100644
--- a/sample_data/tiny_config.ini
+++ b/sample_data/tiny_config.ini
@@ -17,7 +17,8 @@ test_target=sample_data/tiny.out
 [Model]
 source_vocabulary_type=word
 target_vocabulary_type=word
-source_vocabulary_size=30
+unk_frequency=0
+source_vocabulary_size=33
 target_vocabulary_size=33
 encoder_type=bidirectional
 decoder_type=default
diff --git a/src/bin/train.cc b/src/bin/train.cc
index 06f8e61..2e7fcec 100644
--- a/src/bin/train.cc
+++ b/src/bin/train.cc
@@ -172,6 +172,7 @@ void initializeLogger(
 // Arguments:
 //   corpus_filepath: Location of the corpus file to be analyzed.
 //   vocab_type: Name of the vocabulary type.
+//   unk_frequency: Entries occurring at most this many times are left out.
 //   vocab_size: Number of entries in the vocabulary.
 //
 // Returns:
@@ -183,11 +184,12 @@ void initializeLogger(
 nmtkit::Vocabulary * createVocabulary(
     const string & corpus_filepath,
     const string & vocab_type,
+    const unsigned unk_frequency,
     const unsigned vocab_size) {
   if (vocab_type == "character") {
-    return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size);
+    return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size);
   } else if (vocab_type == "word") {
-    return new nmtkit::WordVocabulary(corpus_filepath, vocab_size);
+    return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size);
   }
   NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type);
 }
@@ -370,11 +372,13 @@ int main(int argc, char * argv[]) {
         ::createVocabulary(
             config.get<string>("Corpus.train_source"),
             config.get<string>("Model.source_vocabulary_type"),
+            config.get<unsigned>("Model.unk_frequency"),
             config.get<unsigned>("Model.source_vocabulary_size")));
     boost::scoped_ptr<nmtkit::Vocabulary> trg_vocab(
         ::createVocabulary(
             config.get<string>("Corpus.train_target"),
             config.get<string>("Model.target_vocabulary_type"),
+            config.get<unsigned>("Model.unk_frequency"),
             config.get<unsigned>("Model.target_vocabulary_size")));
     ::saveArchive(model_dir / "source.vocab", archive_format, src_vocab);
     ::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab);
diff --git a/src/include/nmtkit/character_vocabulary.h b/src/include/nmtkit/character_vocabulary.h
index b1554ea..bb9e128 100644
--- a/src/include/nmtkit/character_vocabulary.h
+++ b/src/include/nmtkit/character_vocabulary.h
@@ -32,7 +32,7 @@ class CharacterVocabulary : public Vocabulary {
   // Arguments:
   //   corpus_filename: Location of the corpus file to be analyzed.
   //   size: Size of the vocabulary.
-  CharacterVocabulary(const std::string & corpus_filename, unsigned size);
+  CharacterVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);
 
   ~CharacterVocabulary() override {}
 
diff --git a/src/include/nmtkit/word_vocabulary.h b/src/include/nmtkit/word_vocabulary.h
index e4ca18e..98862d0 100644
--- a/src/include/nmtkit/word_vocabulary.h
+++ b/src/include/nmtkit/word_vocabulary.h
@@ -31,8 +31,9 @@ class WordVocabulary : public Vocabulary {
   //
   // Arguments:
   //   corpus_filename: Location of the corpus file to be analyzed.
+  //   unk_frequency: Words seen at most this many times are treated as unk.
   //   size: Size of the vocabulary.
-  WordVocabulary(const std::string & corpus_filename, unsigned size);
+  WordVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);
 
   ~WordVocabulary() override {}
 
diff --git a/src/lib/character_vocabulary.cc b/src/lib/character_vocabulary.cc
index d91d5ce..1aac5e4 100644
--- a/src/lib/character_vocabulary.cc
+++ b/src/lib/character_vocabulary.cc
@@ -50,8 +50,10 @@ namespace nmtkit {
 
 CharacterVocabulary::CharacterVocabulary(
     const string & corpus_filename,
+    unsigned unk_frequency,
     unsigned size) {
-  NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
+  NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
+  NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
   ifstream ifs(corpus_filename);
   NMTKIT_CHECK(
       ifs.is_open(),
@@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary(
   freq_.emplace_back(num_letters);
   freq_.emplace_back(num_lines);
   freq_.emplace_back(num_lines);
-  for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {
+
+  // With a frequency threshold, `size` is ignored. The loop below starts at
+  // i = 3, so the bound needs +3 to be able to reach the final three entries.
+  const unsigned longest_size =
+      unk_frequency > 0 ? static_cast<unsigned>(entries.size() + 3) : size;
+
+  for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
     const auto & entry = entries[i - 3];
     stoi_[entry.second] = i;
     itos_.emplace_back(entry.second);
diff --git a/src/lib/word_vocabulary.cc b/src/lib/word_vocabulary.cc
index e57230f..85a6333 100644
--- a/src/lib/word_vocabulary.cc
+++ b/src/lib/word_vocabulary.cc
@@ -13,8 +13,9 @@ using namespace std;
 
 namespace nmtkit {
 
-WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
-  NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
+WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) {
+  NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
+  NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
   ifstream ifs(corpus_filename);
   NMTKIT_CHECK(
       ifs.is_open(),
@@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
   // Selects most frequent words.
   vector<pair<unsigned, string>> entries;
   for (const auto & entry : freq) {
-    entries.emplace_back(make_pair(entry.second, entry.first));
+      entries.emplace_back(make_pair(entry.second, entry.first));
   }
   Array::sort(&entries, greater<pair<unsigned, string>>());
 
@@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
   freq_.emplace_back(num_words);
   freq_.emplace_back(num_lines);
   freq_.emplace_back(num_lines);
-  for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {
+
+  // With a frequency threshold, `size` is ignored. The loop below starts at
+  // i = 3, so the bound needs +3 to be able to reach the final three entries.
+  const unsigned longest_size =
+      unk_frequency > 0 ? static_cast<unsigned>(entries.size() + 3) : size;
+
+  for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
     const auto & entry = entries[i - 3];
     stoi_[entry.second] = i;
     itos_.emplace_back(entry.second);
     freq_.emplace_back(entry.first);
     freq_[0] -= entry.first;
   }
+
+
 }
 
 unsigned WordVocabulary::getID(const string & word) const {