Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ number-parser

``number-parser`` is a simple library that allows you to convert numbers written in natural
language to their equivalent numeric forms. It currently supports cardinal numbers in the following
languages - English, Hindi, Spanish and Russian.
languages - English, Hindi, Spanish and Russian - and ordinal numbers in English.

Installation
============
Expand All @@ -20,18 +20,21 @@ number-parser requires Python 3.6+.
Usage
=====

The library provides two major APIs which corresponds to the following two common usages.
The library provides three major APIs which correspond to the following common usages.

Interface #1: Multiple numbers
------------------------------

Identifying the numbers in a text string, converting them to corresponding numeric values while ignoring non-numeric words.
This also supports ordinal number conversion (for English only).

>>> from number_parser import parse
>>> parse("I have two hats and thirty seven coats")
'I have 2 hats and 37 coats'
>>> parse("One, Two, Three go")
'1, 2, 3 go'
>>> parse("First day of year two thousand")
'1 day of year 2000'


Interface #2: Single number
Expand All @@ -41,21 +44,33 @@ Converting a single number written in words to its corresponding integer.
>>> from number_parser import parse_number
>>> parse_number("two thousand and twenty")
2020
>>> output = parse_number("not_a_number")
>>> output
None
>>> parse_number("not_a_number")


Interface #3: Single number Ordinal
-------------------------------------

Converting a single ordinal number written in words to its corresponding integer. (Currently supported for English only.)

>>> from number_parser import parse_ordinal
>>> parse_ordinal("twenty third")
23
>>> parse_ordinal("seventy fifth")
75


Language Support
----------------

The default language is English; you can pass the ``language`` parameter with the corresponding locale for other languages.
It currently supports cardinal numbers in the following
languages - English, Hindi, Spanish and Russian - and ordinal numbers in English.

>>> from number_parser import parse, parse_number
>>> parse("Hay tres gallinas y veintitrés patos", language='es')
'Hay 3 gallinas y 23 patos'
>>> parse_number("चौदह लाख बत्तीस हज़ार पाँच सौ चौबीस", language='hi')
1432524
1432524

Supported cases
---------------
Expand All @@ -72,7 +87,7 @@ Accurately handling usage of conjunction while forming the number.
Handling ambiguous cases without proper separators.

>>> parse("two thousand thousand")
2000 1000
'2000 1000'
>>> parse_number("two thousand two million")
2002000000

Expand Down
2 changes: 1 addition & 1 deletion number_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from number_parser.parser import parse, parse_number
from number_parser.parser import parse, parse_number, parse_ordinal
73 changes: 67 additions & 6 deletions number_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,61 @@ def _normalize_tokens(token_list):
return [_strip_accents(token.lower()) for token in token_list]


def _normalize_dict(lang_dict):
def _normalize_dict(lang_data):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renaming the parameter to lang_data everywhere from lang_dict

"""Removes the accent from each key of input dictionary"""
return {_strip_accents(word): number for word, number in lang_dict.items()}
return {_strip_accents(word): number for word, number in lang_data.items()}


def _is_cardinal_token(token, lang_data):
"""Checks if the given token is a cardinal number and returns token"""
if token in lang_data.all_numbers:
return token
return None


def _is_ordinal_token(token, lang_data):
    """Return the cardinal form of ``token`` if it is an ordinal number word.

    A token that is already a cardinal number is not treated as an ordinal,
    so ``None`` is returned for it. ``None`` is also returned when the token
    is not recognised as a number at all.
    """
    already_cardinal = _is_cardinal_token(token, lang_data) is not None
    if already_cardinal:
        return None
    return _is_number_token(token, lang_data)


def _is_number_token(token, lang_data):
    """Return the cardinal form of a cardinal or ordinal number token.

    The token is first passed through the ordinal-to-cardinal conversion and
    the converted form is then checked against the known cardinal numbers.
    ``None`` is returned when the converted form is not a cardinal number.
    """
    cardinal_form = _apply_cardinal_conversion(token, lang_data)
    return _is_cardinal_token(cardinal_form, lang_data)


def _apply_cardinal_conversion(token, lang_data): # Currently only for English language.
"""Converts ordinal tokens to cardinal while leaving other tokens unchanged."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment like # Note: For now it supports only English?

CARDINAL_DIRECT_NUMBERS = {'first': 'one', 'second': 'two', 'third': 'three', 'fifth': 'five', 'eighth': 'eight',
'ninth': 'nine', 'twelfth': 'twelve'}

for word, number in CARDINAL_DIRECT_NUMBERS.items():
token = token.replace(word, number)

token_cardinal_form_1 = re.sub(r'ieth$', 'y', token)
if _is_cardinal_token(token_cardinal_form_1, lang_data) is not None:
return token_cardinal_form_1

token_cardinal_form_2 = re.sub(r'th$', '', token)
if _is_cardinal_token(token_cardinal_form_2, lang_data) is not None:
return token_cardinal_form_2

return token


def parse_ordinal(input_string, language='en'):
Copy link
Collaborator Author

@arnavkapoor arnavkapoor Aug 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parse_ordinal acts as the third interface after parse , parse_number . All these interfaces have the language parameter while the rest use the lang_data.
This currently converts all tokens to their cardinal form and then gives it to parse_number , which means that using parse_ordinal with only cardinal , cardinal/ordinal mixture and multiple ordinals all work , which is not completely accurate. (added these cases in tests)

"""Converts a single number in ordinal or cardinal form to it's numeric equivalent"""
lang_data = LanguageData(language)
tokens = _tokenize(input_string, language)
normalized_tokens = _normalize_tokens(tokens)
processed_tokens = [_apply_cardinal_conversion(token, lang_data) for token in normalized_tokens]
output_string = ' '.join(processed_tokens)
return parse_number(output_string, language)


def parse_number(input_string, language='en'):
Expand Down Expand Up @@ -194,12 +246,14 @@ def parse(input_string, language='en'):

for token in tokens:
compare_token = _strip_accents(token.lower())
ordinal_number = _is_ordinal_token(compare_token, lang_data)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This parameter is added and used to break the processing when we encounter an ordinal number as it indicates the end of current processing. This is to ensure cases like twentieth seventh fiftieth third work

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add a similar sort of logic to parse_ordinal to break on encountering an ordinal number. This would then reject mixed cardinal/ordinal like twentieth three


if compare_token.isspace() or compare_token == "":
if not tokens_taken:
current_sentence.append(token)
continue

elif compare_token in SENTENCE_SEPARATORS:
if compare_token in SENTENCE_SEPARATORS:
if tokens_taken:
myvalue = _build_number(tokens_taken, lang_data)
for each_number in myvalue:
Expand All @@ -212,18 +266,25 @@ def parse(input_string, language='en'):
current_sentence = []
continue

elif (compare_token in lang_data.all_numbers or
(compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)):
elif ((compare_token in lang_data.all_numbers or
(compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)) and ordinal_number is None):
tokens_taken.append(compare_token)

else:
if ordinal_number is not None:
tokens_taken.append(ordinal_number)

if tokens_taken:
myvalue = _build_number(tokens_taken, lang_data)
for each_number in myvalue:
current_sentence.append(each_number)
current_sentence.append(" ")
tokens_taken = []
current_sentence.append(token)

if ordinal_number is None:
current_sentence.append(token)
else:
current_sentence.pop() # Handling extra space when breaking on ordinal numbers.

if tokens_taken:
myvalue = _build_number(tokens_taken, lang_data)
Expand Down
45 changes: 44 additions & 1 deletion tests/test_language_en.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from number_parser import parse, parse_number
from number_parser import parse, parse_number, parse_ordinal
from tests import HUNDREDS_DIRECTORY, PERMUTATION_DIRECTORY
from tests import _test_files
LANG = 'en'
Expand Down Expand Up @@ -95,3 +95,46 @@ def test_parse_number_till_hundred(self):

def test_parse_number_permutations(self):
_test_files(PERMUTATION_DIRECTORY, LANG)

@pytest.mark.parametrize(
    "test_input,expected",
    [
        ("eleventh", 11),
        ("nineteenth", 19),
        ("hundredth", 100),
        ("one hundred and forty second", 142),
        ("thousandth", 1_000),
        ("two thousand and fifth", 2_005),
        ("millionth", 1_000_000),
        ("two million three thousand and nineteenth", 2_003_019),
        ("two million twenty three thousand and forty ninth", 2_023_049),
        ("two million three thousand nine hundred and eighty fourth", 2_003_984),
        ("billionth", 1_000_000_000),
        # Inputs expected to yield None
        ("with goldsmith", None),
        ("th th", None),
        ("fifth fiftieth", None),
        # Some ambiguous cases
        ("fiftieth fifth", 55),
        ("fiftieth five", 55),
        ("fifty five", 55),
    ],
)
def test_parse_ordinal(self, expected, test_input):
    """Check ``parse_ordinal`` against each (input, expected) pair for ``LANG``."""
    result = parse_ordinal(test_input, LANG)
    assert result == expected

@pytest.mark.parametrize(
    "test_input,expected",
    [
        ("eleventh day of summer", "11 day of summer"),
        ("nineteenth may two thousand", "19 may 2000"),
        ("hundredth and one", "100 and 1"),
        ("one hundred and forty second", "142"),
        ("five thousandth and one", "5000 and 1"),
        ("thirty seven and fifth", "37 and 5"),
        ("eighth month of year two thousand and twentieth", "8 month of year 2020"),
        ("He crieth, a path with fifty fifth steps", "He crieth, a path with 55 steps"),
        ("twentieth seventh fiftieth third", "20 7 50 3"),
    ],
)
def test_parse_sentences_ordinal(self, expected, test_input):
    """Check that ``parse`` rewrites ordinal words inside sentences for ``LANG``."""
    result = parse(test_input, LANG)
    assert result == expected