-
Notifications
You must be signed in to change notification settings - Fork 7
Unk frequency #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,8 @@ test_target=sample_data/tiny.out | |
| [Model] | ||
| source_vocabulary_type=word | ||
| target_vocabulary_type=word | ||
| source_vocabulary_size=30 | ||
| unk_frequency=0 | ||
| source_vocabulary_size=33 | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please change this back to 30. |
||
| target_vocabulary_size=33 | ||
| encoder_type=bidirectional | ||
| decoder_type=default | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -172,6 +172,7 @@ void initializeLogger( | |
| // Arguments: | ||
| // corpus_filepath: Location of the corpus file to be analyzed. | ||
| // vocab_type: Name of the vocabulary type. | ||
| // unk_frequency: Frequency of unknown word. | ||
| // vocab_size: Number of entries in the vocabulary. | ||
| // | ||
| // Returns: | ||
|
|
@@ -183,11 +184,12 @@ void initializeLogger( | |
| nmtkit::Vocabulary * createVocabulary( | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think each parameter should basically have only one meaning, to prevent it from being abused. CharacterVocabulary and WordVocabulary could take one more parameter that chooses the unk filtering strategy (specified in the config file), so that the number of meanings attached to unk_frequency does not grow. |
||
| const string & corpus_filepath, | ||
| const string & vocab_type, | ||
| const unsigned unk_frequency, | ||
| const unsigned vocab_size) { | ||
| if (vocab_type == "character") { | ||
| return new nmtkit::CharacterVocabulary(corpus_filepath, vocab_size); | ||
| return new nmtkit::CharacterVocabulary(corpus_filepath, unk_frequency, vocab_size); | ||
| } else if (vocab_type == "word") { | ||
| return new nmtkit::WordVocabulary(corpus_filepath, vocab_size); | ||
| return new nmtkit::WordVocabulary(corpus_filepath, unk_frequency, vocab_size); | ||
| } | ||
| NMTKIT_FATAL("Invalid vocabulary type: " + vocab_type); | ||
| } | ||
|
|
@@ -370,11 +372,13 @@ int main(int argc, char * argv[]) { | |
| ::createVocabulary( | ||
| config.get<string>("Corpus.train_source"), | ||
| config.get<string>("Model.source_vocabulary_type"), | ||
| config.get<unsigned>("Model.unk_frequency"), | ||
| config.get<unsigned>("Model.source_vocabulary_size"))); | ||
| boost::scoped_ptr<nmtkit::Vocabulary> trg_vocab( | ||
| ::createVocabulary( | ||
| config.get<string>("Corpus.train_target"), | ||
| config.get<string>("Model.target_vocabulary_type"), | ||
| config.get<unsigned>("Model.unk_frequency"), | ||
| config.get<unsigned>("Model.target_vocabulary_size"))); | ||
| ::saveArchive(model_dir / "source.vocab", archive_format, src_vocab); | ||
| ::saveArchive(model_dir / "target.vocab", archive_format, trg_vocab); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,8 +50,10 @@ namespace nmtkit { | |
|
|
||
| CharacterVocabulary::CharacterVocabulary( | ||
| const string & corpus_filename, | ||
| unsigned unk_frequency, | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add some test code in src/test/character_vocabulary_test.cc for unk_frequency? |
||
| unsigned size) { | ||
| NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); | ||
| NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); | ||
| NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); | ||
| ifstream ifs(corpus_filename); | ||
| NMTKIT_CHECK( | ||
| ifs.is_open(), | ||
|
|
@@ -88,7 +90,13 @@ CharacterVocabulary::CharacterVocabulary( | |
| freq_.emplace_back(num_letters); | ||
| freq_.emplace_back(num_lines); | ||
| freq_.emplace_back(num_lines); | ||
| for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { | ||
|
|
||
| unsigned longest_size = size; | ||
| if (unk_frequency > 0) { | ||
| longest_size = entries.size(); | ||
| } | ||
|
|
||
| for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { | ||
| const auto & entry = entries[i - 3]; | ||
| stoi_[entry.second] = i; | ||
| itos_.emplace_back(entry.second); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,8 +13,9 @@ using namespace std; | |
|
|
||
| namespace nmtkit { | ||
|
|
||
| WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { | ||
| NMTKIT_CHECK(size >= 3, "Size should be equal or greater than 3."); | ||
| WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned unk_frequency, unsigned size) { | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add some test code in src/test/word_vocabulary_test.cc for unk_frequency? |
||
| NMTKIT_CHECK(!(unk_frequency == 0 && size == 0), "Either size or unk_frequency must be specified."); | ||
| NMTKIT_CHECK(unk_frequency > 0 || size >= 3, "Size should be equal or greater than 3."); | ||
| ifstream ifs(corpus_filename); | ||
| NMTKIT_CHECK( | ||
| ifs.is_open(), | ||
|
|
@@ -39,7 +40,7 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { | |
| // Selects most frequent words. | ||
| vector<pair<unsigned, string>> entries; | ||
| for (const auto & entry : freq) { | ||
| entries.emplace_back(make_pair(entry.second, entry.first)); | ||
| entries.emplace_back(make_pair(entry.second, entry.first)); | ||
| } | ||
| Array::sort(&entries, greater<pair<unsigned, string>>()); | ||
|
|
||
|
|
@@ -53,13 +54,21 @@ WordVocabulary::WordVocabulary(const string & corpus_filename, unsigned size) { | |
| freq_.emplace_back(num_words); | ||
| freq_.emplace_back(num_lines); | ||
| freq_.emplace_back(num_lines); | ||
| for (unsigned i = 3; i < size && i - 3 < entries.size(); ++i) { | ||
|
|
||
| unsigned longest_size = size; | ||
| if (unk_frequency > 0) { | ||
| longest_size = entries.size(); | ||
| } | ||
|
|
||
| for (unsigned i = 3; i < longest_size && i - 3 < entries.size() && entries[i-3].first > unk_frequency; ++i) { | ||
| const auto & entry = entries[i - 3]; | ||
| stoi_[entry.second] = i; | ||
| itos_.emplace_back(entry.second); | ||
| freq_.emplace_back(entry.first); | ||
| freq_[0] -= entry.first; | ||
| } | ||
|
|
||
|
|
||
| } | ||
|
|
||
| unsigned WordVocabulary::getID(const string & word) const { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This option needs a detailed description.
And I guess this option has multiple meanings:
Maybe they could be separated into some unique options. For example: