Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sample_data/sample_config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ source_vocabulary_type=word
target_vocabulary_type=word

; Vocabulary size in each side.
unk_frequency=0
Copy link
Owner

@odashi odashi Jan 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This option needs a detailed description.
And I guess this option has multiple meanings:

  • choosing filtering strategy as either by-frequency or by-ranking,
  • specifying the threshold of unknown words in both source/target languages.

Maybe they could be split into separate, single-purpose options. For example:

unk_filter_type=frequency/rank
source_unk_frequency=3 (only used when type=frequency)
target_unk_frequency=4 (ditto)
source_vocabulary_size=4100 (only used when type=rank)
target_vocabulary_size=4900 (ditto)

source_vocabulary_size=4100
target_vocabulary_size=4900

Expand Down
3 changes: 2 additions & 1 deletion sample_data/tiny_config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ test_target=sample_data/tiny.out
[Model]
source_vocabulary_type=word
target_vocabulary_type=word
source_vocabulary_size=30
unk_frequency=0
source_vocabulary_size=33
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change this back to 30.

target_vocabulary_size=33
encoder_type=bidirectional
decoder_type=default
Expand Down
8 changes: 6 additions & 2 deletions src/bin/train.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ void initializeLogger(
// Arguments:
// corpus_filepath: Location of the corpus file to be analyzed.
// vocab_type: Name of the vocabulary type.
// unk_frequency: Frequency threshold; words whose frequency is not greater than this value are treated as unknown.
// vocab_size: Number of entries in the vocabulary.
//
// Returns:
Expand All @@ -183,11 +184,12 @@ void initializeLogger(
nmtkit::Vocabulary * createVocabulary(
Copy link
Owner

@odashi odashi Jan 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think basically one parameter should have only one meaning, to prevent it from being abused. CharacterVocabulary and WordVocabulary could take one more parameter to choose the unk filtering strategy (just specified in the config file), to avoid increasing the number of meanings of unk_frequency.

const string & corpus_filepath,
const string & vocab_type,
const unsigned unk_frequency,
const unsigned vocab_size) {
if (vocab_type == "character") {
return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size);
return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size);
} else if (vocab_type == "word") {
return new nmtkit::WordVocabulary(corpus_filepath, vocab_size);
return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size);
}
NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type);
}
Expand Down Expand Up @@ -370,11 +372,13 @@ int main(int argc, char * argv[]) {
::createVocabulary(
config.get<string>("Corpus.train_source"),
config.get<string>("Model.source_vocabulary_type"),
config.get<unsigned>("Model.unk_frequency"),
config.get<unsigned>("Model.source_vocabulary_size")));
boost::scoped_ptr<nmtkit::Vocabulary> trg_vocab(
::createVocabulary(
config.get<string>("Corpus.train_target"),
config.get<string>("Model.target_vocabulary_type"),
config.get<unsigned>("Model.unk_frequency"),
config.get<unsigned>("Model.target_vocabulary_size")));
::saveArchive(model_dir / "source.vocab", archive_format, src_vocab);
::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab);
Expand Down
2 changes: 1 addition & 1 deletion src/include/nmtkit/character_vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class CharacterVocabulary : public Vocabulary {
// Arguments:
// corpus_filename: Location of the corpus file to be analyzed.
// size: Size of the vocabulary.
CharacterVocabulary(const std::string & corpus_filename, unsigned size);
CharacterVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);

~CharacterVocabulary() override {}

Expand Down
3 changes: 2 additions & 1 deletion src/include/nmtkit/word_vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ class WordVocabulary : public Vocabulary {
//
// Arguments:
// corpus_filename: Location of the corpus file to be analyzed.
// unk_frequency: Words whose frequency is not greater than this value are treated as unk.
// size: Size of the vocabulary.
WordVocabulary(const std::string & corpus_filename, unsigned size);
WordVocabulary(const std::string & corpus_filename, unsigned unk_frequency, unsigned size);

~WordVocabulary() override {}

Expand Down
12 changes: 10 additions & 2 deletions src/lib/character_vocabulary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ namespace nmtkit {

CharacterVocabulary::CharacterVocabulary(
const string & corpus_filename,
unsigned unk_frequency,
Copy link
Owner

@odashi odashi Jan 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add some test code in src/test/character_vocabulary_test.cc for unk_frequency?

unsigned size) {
NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
ifstream ifs(corpus_filename);
NMTKIT_CHECK(
ifs.is_open(),
Expand Down Expand Up @@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary(
freq_.emplace_back(num_letters);
freq_.emplace_back(num_lines);
freq_.emplace_back(num_lines);
for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {

unsigned longest_size = size;
if (unk_frequency > 0) {
longest_size = entries.size();
}

for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
const auto & entry = entries[i - 3];
stoi_[entry.second] = i;
itos_.emplace_back(entry.second);
Expand Down
17 changes: 13 additions & 4 deletions src/lib/word_vocabulary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ using namespace std;

namespace nmtkit {

WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3.");
WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) {
Copy link
Owner

@odashi odashi Jan 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add some test code in src/test/word_vocabulary_test.cc for unk_frequency?

NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified.");
NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3.");
ifstream ifs(corpus_filename);
NMTKIT_CHECK(
ifs.is_open(),
Expand All @@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
// Selects most frequent words.
vector<pair<unsigned, string>> entries;
for (const auto & entry : freq) {
entries.emplace_back(make_pair(entry.second, entry.first));
entries.emplace_back(make_pair(entry.second, entry.first));
}
Array::sort(&entries, greater<pair<unsigned, string>>());

Expand All @@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) {
freq_.emplace_back(num_words);
freq_.emplace_back(num_lines);
freq_.emplace_back(num_lines);
for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) {

unsigned longest_size = size;
if (unk_frequency > 0) {
longest_size = entries.size();
}

for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) {
const auto & entry = entries[i - 3];
stoi_[entry.second] = i;
itos_.emplace_back(entry.second);
freq_.emplace_back(entry.first);
freq_[0] -= entry.first;
}


}

unsigned WordVocabulary::getID(const string & word) const {
Expand Down