-
Notifications
You must be signed in to change notification settings - Fork 25
Restructure code for ordinal support #35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a08a84f
a350f46
465a868
b779c7d
66f688b
08bee2f
3392bdf
5fab19a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| from number_parser.parser import parse, parse_number | ||
| from number_parser.parser import parse, parse_number, parse_ordinal |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -153,9 +153,61 @@ def _normalize_tokens(token_list): | |
| return [_strip_accents(token.lower()) for token in token_list] | ||
|
|
||
|
|
||
| def _normalize_dict(lang_dict): | ||
| def _normalize_dict(lang_data): | ||
| """Removes the accent from each key of input dictionary""" | ||
| return {_strip_accents(word): number for word, number in lang_dict.items()} | ||
| return {_strip_accents(word): number for word, number in lang_data.items()} | ||
|
|
||
|
|
||
| def _is_cardinal_token(token, lang_data): | ||
| """Checks if the given token is a cardinal number and returns token""" | ||
| if token in lang_data.all_numbers: | ||
| return token | ||
| return None | ||
|
|
||
|
|
||
| def _is_ordinal_token(token, lang_data): | ||
| """Checks if the given token is an ordinal number and returns token""" | ||
| if _is_cardinal_token(token, lang_data) is None: | ||
| return _is_number_token(token, lang_data) | ||
| return None | ||
|
|
||
|
|
||
| def _is_number_token(token, lang_data): | ||
| """ | ||
| Checks if the given token belongs to either cardinal or ordinal numbers | ||
| and returns the cardinal form. | ||
| """ | ||
| token = _apply_cardinal_conversion(token, lang_data) | ||
| return _is_cardinal_token(token, lang_data) | ||
|
|
||
|
|
||
| def _apply_cardinal_conversion(token, lang_data): # Currently only for English language. | ||
| """Converts ordinal tokens to cardinal while leaving other tokens unchanged.""" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add a comment like |
||
| CARDINAL_DIRECT_NUMBERS = {'first': 'one', 'second': 'two', 'third': 'three', 'fifth': 'five', 'eighth': 'eight', | ||
| 'ninth': 'nine', 'twelfth': 'twelve'} | ||
|
|
||
| for word, number in CARDINAL_DIRECT_NUMBERS.items(): | ||
| token = token.replace(word, number) | ||
|
|
||
| token_cardinal_form_1 = re.sub(r'ieth$', 'y', token) | ||
| if _is_cardinal_token(token_cardinal_form_1, lang_data) is not None: | ||
| return token_cardinal_form_1 | ||
|
|
||
| token_cardinal_form_2 = re.sub(r'th$', '', token) | ||
| if _is_cardinal_token(token_cardinal_form_2, lang_data) is not None: | ||
| return token_cardinal_form_2 | ||
|
|
||
| return token | ||
|
|
||
|
|
||
| def parse_ordinal(input_string, language='en'): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| """Converts a single number in ordinal or cardinal form to its numeric equivalent""" | ||
| lang_data = LanguageData(language) | ||
| tokens = _tokenize(input_string, language) | ||
| normalized_tokens = _normalize_tokens(tokens) | ||
| processed_tokens = [_apply_cardinal_conversion(token, lang_data) for token in normalized_tokens] | ||
| output_string = ' '.join(processed_tokens) | ||
| return parse_number(output_string, language) | ||
|
|
||
|
|
||
| def parse_number(input_string, language='en'): | ||
|
|
@@ -194,12 +246,14 @@ def parse(input_string, language='en'): | |
|
|
||
| for token in tokens: | ||
| compare_token = _strip_accents(token.lower()) | ||
| ordinal_number = _is_ordinal_token(compare_token, lang_data) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This parameter is added and used to break the processing when we encounter an ordinal number, as it indicates the end of the current processing. This is to ensure cases like
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can add a similar sort of logic to |
||
|
|
||
| if compare_token.isspace() or compare_token == "": | ||
| if not tokens_taken: | ||
| current_sentence.append(token) | ||
| continue | ||
|
|
||
| elif compare_token in SENTENCE_SEPARATORS: | ||
| if compare_token in SENTENCE_SEPARATORS: | ||
| if tokens_taken: | ||
| myvalue = _build_number(tokens_taken, lang_data) | ||
| for each_number in myvalue: | ||
|
|
@@ -212,18 +266,25 @@ def parse(input_string, language='en'): | |
| current_sentence = [] | ||
| continue | ||
|
|
||
| elif (compare_token in lang_data.all_numbers or | ||
| (compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)): | ||
| elif ((compare_token in lang_data.all_numbers or | ||
| (compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)) and ordinal_number is None): | ||
| tokens_taken.append(compare_token) | ||
|
|
||
| else: | ||
| if ordinal_number is not None: | ||
| tokens_taken.append(ordinal_number) | ||
|
|
||
| if tokens_taken: | ||
| myvalue = _build_number(tokens_taken, lang_data) | ||
| for each_number in myvalue: | ||
| current_sentence.append(each_number) | ||
| current_sentence.append(" ") | ||
| tokens_taken = [] | ||
| current_sentence.append(token) | ||
|
|
||
| if ordinal_number is None: | ||
| current_sentence.append(token) | ||
| else: | ||
| current_sentence.pop() # Handling extra space when breaking on ordinal numbers. | ||
|
|
||
| if tokens_taken: | ||
| myvalue = _build_number(tokens_taken, lang_data) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
renaming the parameter to
`lang_data` everywhere from `lang_dict`