scrapinghub · arnavkapoor · Aug 14, 2020 · Aug 7, 2020 · Aug 7, 2020 · Aug 12, 2020
diff --git a/README.rst b/README.rst
@@ -7,7 +7,7 @@ number-parser
 
 ``number-parser`` is a simple library that allows you to convert numbers written in the natural
 language to it's equivalent numeric forms. It currently supports cardinal numbers in the following 
-languages - English, Hindi, Spanish and Russian.
+languages - English, Hindi, Spanish and Russian - as well as ordinal numbers in English.
 
 Installation
 ============
@@ -20,18 +20,21 @@ number-parser requires Python 3.6+.
 Usage
 =====
 
-The library provides two major APIs which corresponds to the following two common usages.
+The library provides three major APIs which correspond to the following common usages.
 
 Interface #1: Multiple numbers 
 ------------------------------
 
 Identifying the numbers in a text string, converting them to corresponding numeric values while ignoring non-numeric words.
+This also supports ordinal number conversion (for English only).
 
 >>> from number_parser import parse
 >>> parse("I have two hats and thirty seven coats")
 'I have 2 hats and 37 coats'
 >>> parse("One, Two, Three go")
 '1, 2, 3 go'
+>>> parse("First day of year two thousand")
+'1 day of year 2000'
 
 
 Interface #2: Single number 
@@ -41,21 +44,33 @@ Converting a single number written in words to it's corresponding integer.
 >>> from number_parser import parse_number
 >>> parse_number("two thousand and twenty")
 2020
->>> output = parse_number("not_a_number")
->>> output
-None
+>>> parse_number("not_a_number")
+
+
+Interface #3: Single number Ordinal
+-------------------------------------
+
+Converting a single ordinal number written in words to its corresponding integer. (Currently supported for English only.)
+
+>>> from number_parser import parse_ordinal
+>>> parse_ordinal("twenty third")
+23
+>>> parse_ordinal("seventy fifth")
+75
 
 
 Language Support
 ----------------
 
 The default language is English, you can pass the language parameter with corresponding locale for other languages.
+It currently supports cardinal numbers in the following
+languages - English, Hindi, Spanish and Russian - as well as ordinal numbers in English.
 
 >>> from number_parser import parse, parse_number
 >>> parse("Hay tres gallinas y veintitrés patos", language='es')
 'Hay 3 gallinas y 23 patos'
 >>> parse_number("चौदह लाख बत्तीस हज़ार पाँच सौ चौबीस", language='hi')
-1432524 
+1432524
 
 Supported cases
 ---------------
@@ -72,7 +87,7 @@ Accurately handling usage of conjunction while forming the number.
 Handling ambiguous cases without proper separators.
 
 >>> parse("two thousand thousand")
-2000 1000
+'2000 1000'
 >>> parse_number("two thousand two million")
 2002000000
 

diff --git a/number_parser/__init__.py b/number_parser/__init__.py
@@ -1 +1 @@
-from number_parser.parser import parse, parse_number
+from number_parser.parser import parse, parse_number, parse_ordinal
diff --git a/number_parser/parser.py b/number_parser/parser.py
@@ -153,9 +153,61 @@ def _normalize_tokens(token_list):
     return [_strip_accents(token.lower()) for token in token_list]
 
 
-def _normalize_dict(lang_dict):
+def _normalize_dict(lang_data):
     """Removes the accent from each key of input dictionary"""
-    return {_strip_accents(word): number for word, number in lang_dict.items()}
+    return {_strip_accents(word): number for word, number in lang_data.items()}
+
+
+def _is_cardinal_token(token, lang_data):
+    """Checks if the given token is a cardinal number and returns token"""
+    if token in lang_data.all_numbers:
+        return token
+    return None
+
+
+def _is_ordinal_token(token, lang_data):
+    """Checks if the given token is an ordinal number and returns its cardinal form (else None)"""
+    if _is_cardinal_token(token, lang_data) is None:
+        return _is_number_token(token, lang_data)
+    return None
+
+
+def _is_number_token(token, lang_data):
+    """
+    Checks if the given token belongs to either cardinal or ordinal numbers
+    and returns the cardinal form.
+    """
+    token = _apply_cardinal_conversion(token, lang_data)
+    return _is_cardinal_token(token, lang_data)
+
+
+def _apply_cardinal_conversion(token, lang_data):  # Currently only for English language.
+    """Converts ordinal tokens to cardinal while leaving other tokens unchanged."""
+    CARDINAL_DIRECT_NUMBERS = {'first': 'one', 'second': 'two', 'third': 'three', 'fifth': 'five', 'eighth': 'eight',
+                               'ninth': 'nine', 'twelfth': 'twelve'}
+
+    for word, number in CARDINAL_DIRECT_NUMBERS.items():
+        token = token.replace(word, number)
+
+    token_cardinal_form_1 = re.sub(r'ieth$', 'y', token)
+    if _is_cardinal_token(token_cardinal_form_1, lang_data) is not None:
+        return token_cardinal_form_1
+
+    token_cardinal_form_2 = re.sub(r'th$', '', token)
+    if _is_cardinal_token(token_cardinal_form_2, lang_data) is not None:
+        return token_cardinal_form_2
+
+    return token
+
+
+def parse_ordinal(input_string, language='en'):
+    """Converts a single number in ordinal or cardinal form to its numeric equivalent"""
+    lang_data = LanguageData(language)
+    tokens = _tokenize(input_string, language)
+    normalized_tokens = _normalize_tokens(tokens)
+    processed_tokens = [_apply_cardinal_conversion(token, lang_data) for token in normalized_tokens]
+    output_string = ' '.join(processed_tokens)
+    return parse_number(output_string, language)
 
 
 def parse_number(input_string, language='en'):
@@ -194,12 +246,14 @@ def parse(input_string, language='en'):
 
     for token in tokens:
         compare_token = _strip_accents(token.lower())
+        ordinal_number = _is_ordinal_token(compare_token, lang_data)
+
         if compare_token.isspace() or compare_token == "":
             if not tokens_taken:
                 current_sentence.append(token)
             continue
 
-        elif compare_token in SENTENCE_SEPARATORS:
+        if compare_token in SENTENCE_SEPARATORS:
             if tokens_taken:
                 myvalue = _build_number(tokens_taken, lang_data)
                 for each_number in myvalue:
@@ -212,18 +266,25 @@ def parse(input_string, language='en'):
             current_sentence = []
             continue
 
-        elif (compare_token in lang_data.all_numbers or
-                (compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)):
+        elif ((compare_token in lang_data.all_numbers or
+                (compare_token in lang_data.skip_tokens and len(tokens_taken) != 0)) and ordinal_number is None):
             tokens_taken.append(compare_token)
 
         else:
+            if ordinal_number is not None:
+                tokens_taken.append(ordinal_number)
+
             if tokens_taken:
                 myvalue = _build_number(tokens_taken, lang_data)
                 for each_number in myvalue:
                     current_sentence.append(each_number)
                     current_sentence.append(" ")
                 tokens_taken = []
-            current_sentence.append(token)
+
+            if ordinal_number is None:
+                current_sentence.append(token)
+            else:
+                current_sentence.pop()  # Handling extra space when breaking on ordinal numbers.
 
     if tokens_taken:
         myvalue = _build_number(tokens_taken, lang_data)

diff --git a/tests/test_language_en.py b/tests/test_language_en.py
@@ -1,5 +1,5 @@
 import pytest
-from number_parser import parse, parse_number
+from number_parser import parse, parse_number, parse_ordinal
 from tests import HUNDREDS_DIRECTORY, PERMUTATION_DIRECTORY
 from tests import _test_files
 LANG = 'en'
@@ -95,3 +95,46 @@ def test_parse_number_till_hundred(self):
 
     def test_parse_number_permutations(self):
         _test_files(PERMUTATION_DIRECTORY, LANG)
+
+    @pytest.mark.parametrize(
+        "test_input,expected",
+        [
+            ('eleventh', 11),
+            ("nineteenth", 19),
+            ('hundredth', 100),
+            ('one hundred and forty second', 142),
+            ('thousandth', 1_000),
+            ("two thousand and fifth", 2_005),
+            ('millionth', 1_000_000),
+            ("two million three thousand and nineteenth", 2_003_019),
+            ('two million twenty three thousand and forty ninth', 2_023_049),
+            ("two million three thousand nine hundred and eighty fourth", 2_003_984),
+            ('billionth', 1_000_000_000),
+            ('with goldsmith', None),
+            ('th th', None),
+            ('fifth fiftieth', None),
+            # Some ambiguous cases
+            ('fiftieth fifth', 55),
+            ('fiftieth five', 55),
+            ('fifty five', 55)
+        ]
+    )
+    def test_parse_ordinal(self, expected, test_input):
+        assert parse_ordinal(test_input, LANG) == expected
+
+    @pytest.mark.parametrize(
+        "test_input,expected",
+        [
+            ('eleventh day of summer', "11 day of summer"),
+            ("nineteenth may two thousand", "19 may 2000"),
+            ('hundredth and one', "100 and 1"),
+            ('one hundred and forty second', "142"),
+            ('five thousandth and one', "5000 and 1"),
+            ("thirty seven and fifth", "37 and 5"),
+            ('eighth month of year two thousand and twentieth', "8 month of year 2020"),
+            ('He crieth, a path with fifty fifth steps', "He crieth, a path with 55 steps"),
+            ('twentieth seventh fiftieth third', "20 7 50 3")
+        ]
+    )
+    def test_parse_sentences_ordinal(self, expected, test_input):
+        assert parse(test_input, LANG) == expected
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from number_parser.parser import parse, parse_number
		from number_parser.parser import parse, parse_number, parse_ordinal